Hi, I’m not use to write down question but I’ve long search on google and I can’t find answer to my problem. I trying to implement the algorithm of sugimoto et al concerning Compressive Bilateral Filter (**DOI:** 10.1109/TIP.2015.2442916) using their O(1) gaussian filter (**DOI:** 10.1109/ICIP.2013.6738106). I succeded to implement it and I’m now trying to optimized it using distributed loops, but when I run the @distributed version I only got a zeros arrays as results.

As I forecast this might not be clear here is the code I use for parallele and serial loop.

The O(1) gaussian filter: (for the non distributed version just remove the @distributed)

```
function Fastgauss(img::Array{Float64}, σ::Union{Float64,Array{Float64}}, K::Int64=3)
# Derivation of the Gaussian spatial kernel error
function Dkernelerror(R::Int64, σ::Float64)
ϕ = (2*R+1)/(2*σ)
ψ = π*σ*(2*K+1)/(2*R+1)
# derivative of Es(R)+Ef(K,R) = erfc(ϕ)+(erfc(ψ)-erfc(πσ)) wrt R
return (sqrt(π)*σ*(2*K+1)*exp(-ψ*ψ))/(R*R+R+0.25)-(2*exp(-ϕ*ϕ))/(sqrt(π)*σ)
end
# Binary search
low = Int64(floor(2*σ))
up = Int64(ceil(4*σ))
while up > low+1
mid = Int64(round((low+up)/2))
if Dkernelerror(mid, σ) < 0
low = mid
else
up = mid
end
end
if abs(Dkernelerror(low, σ)) <= abs(Dkernelerror(up, σ))
R = low
else
R = up
end
# kernel min size = 3
if R < 3
R = 3
end
# Filter for one dimension
function filter1d(f::Array{Float64,1}, σ::Float64, R::Int64, K::Int64)
# output
fo = zeros(length(f))
# reflecting padding of f
f = [f[(R+1):-1:2];f;f[(end-1):-1:(end-R-1)]]
# precompute γkcos(ϕku) and G0
ϕ = 2*π/(2*R+1)
G0 = 1/(2*R+1)
coeff = fill(Float64[],2)
coeff[1] = 2 .*cos.(ϕ.*(1:K))
coeff[2] = 2 .*exp.((-0.5*σ*σ*ϕ*ϕ).*(1:K).*(1:K)).*cos.((ϕ*R).*(1:K))
F0 = sum(f[1:(2*R+1)])
Zprev = zeros(K)
Z = zeros(K)
for k in 1:K
γcos = (2*exp(-0.5*σ*σ*ϕ*ϕ*k*k)).*cos.((ϕ*k).*(-R:R))
Zprev[k] = sum(γcos.*f[1:(2*R+1)])
Z[k] = sum(γcos.*f[2:(2*R+2)])
end
fo[1] = G0*(F0+sum(Zprev))
F0 += f[2*R+2]-f[1]
for x in 1:(length(fo)-1)
fo[x+1] = G0*(F0+sum(Z))
# F0(x+1) = F0(x) + f(x+R+1) - f(x-R)
F0 += f[2*R+2+x]-f[x+1]
# Δ = f(x+R+1) - f(x+R) - f(x-R) + f(x-R-1)
Δ = f[2*R+2+x]-f[2*R+1+x]-f[x+1]+f[x]
for k in 1:K
ξ = coeff[1][k]*Z[k]-Zprev[k]+coeff[2][k]*Δ
Zprev[k] = Z[k]
Z[k] = ξ
end
end
return fo
end
# apply filter1d in each dimension
sz = size(img)
# output image
imgo = zeros(sz)
# filter x
@distributed for y in 1:sz[1]
imgo[y,:] = filter1d(img[y,:],σ,R,K)
end
# filter y
@distributed for x in 1:sz[2]
imgo[:,x] = filter1d(imgo[:,x],σ,R,K)
end
return imgo
end
```

And the CompressIve Bilateral Filter :

```
function CBLF(img::Array{Float64}, guide::Array{Float64}, σs::Union{Float64,Array{Float64}}, σr::Float64, τ::Float64=0.005)
@eval using SpecialFunctions
# normalize tone of img and guide to [0,1]
img = (img.-minimum(img[:]))./(maximum(img[:])-minimum(img[:]))
guide = (guide.-minimum(guide[:]))./(maximum(guide[:])-minimum(guide[:]))
# optimize for K
ξ = erfcinv(τ*τ)
K = Int64(ceil(ξ*ξ/(2*π)+ξ/(2*π*σr)-0.5))
function Dkernelerror(T::Float64)
ϕ = (π*σr*(2*K+1))/T
ψ = (T-1.0)/σr
# derivative of E(K,T) = erfc(ϕ)+erfc(ψ)
return ((2*sqrt(π)*σr*(2*K+1))/(T*T))*exp(-ϕ*ϕ)-(2/(sqrt(π)*σr))*exp(-ψ*ψ)
end
# Binary search
# extend search domain by 0.03
low = σr*ξ+1.0-0.03
up = (π*σr*(2*K+1))/ξ+0.03
# loop 10times to find T
for i in 1:10
mid = (low+up)/2
if Dkernelerror(mid) < 0
low = mid
else
up = mid
end
end
T = (low+up)/2
b0 = ones(size(img))
b = Fastgauss(img,σs)
for k in 1:K
ω = 2*π*k/T
ak = 2*exp(-0.5*ω*ω*σr*σr)
c = cos.(ω.*guide)
s = sin.(ω.*guide)
Ψc = Fastgauss(c,σs)
Ψs = Fastgauss(s,σs)
b0 += ak.*(c.*Ψc.+s.*Ψs)
Ψc = Fastgauss(c.*img,σs)
Ψs = Fastgauss(s.*img,σs)
b += ak.*(c.*Ψc.+s.*Ψs)
end
return b./b0
end
```

For the sake of exemple I use the “lighthouse” from TestImages as it:

```
img = convert(Array{Float64},Gray.(testimage("lighthouse")));
```

When I run Fastgauss on this image I get a major improvement in computation time using @distributed loops : (using BenchmarkTools)

```
# distributed loops
@btime simg = Fastgauss(img,3.0);
634.200 μs (18 allocations: 3.00 MiB)
# serial loops
@btime simg = Fastgauss(img,3.0);
20.775 ms (34562 allocations: 27.54 MiB)
```

The result for Fastgauss in @distributed and serial version is the same. But even if it provide an improvement in running time the use of @distributed version of Fastgauss in CBLF produce a bad output (a zeros array).

```
# distributed loops
@btime fimg = CBLF(img,img,10.0,0.2);
83.776 ms (1318 allocations: 177.07 MiB)
# serial loops
@btime fimg = CBLF(img,img,10.0,0.2);
523.033 ms (588378 allocations: 738.37 MiB)
```

(I’m not able to add more than one picture but I can upload the 2 different output of CBLF on demande). Moreover I also notice that when I use btime on the @ditributed version it takes a long time to write down the execution time (~2/3min) and the memory consuption explode (~15Gb of RAM).

However the serial version of Fastgauss use by CLBF produce the desired output and I don’t get why the parallel one don’t ?

Thanks in advance for your help,

Geoffrey