# Best Practices for Parallel implementation of Gibbs Sampler algorithm on single machine

I am re-posting in this section as it seems a more relevant forum thant “Julia at Scale”. Thanks in advance for your comments and advice.

Would someone comment on how to improve my coding? In particular:

1. did I use @sync properly?
2. Did I use Distributed arrays efficiently?Would someone comment on how to improve my coding?
3. How does random number generation work on parallel cores? Should each core be started with its own seed?
``````using Gibbs:ConvertToNumber,ConvertToLetter,CalcFreq
@everywhere using Gibbs:GibbsSampler,CalcRelEntropy

@everywhere using DistributedArrays

@everywhere using Distributions

ks = ARGS[1]
Ns = ARGS[2]
const k = parse(Int,ks)
const N = parse(Int,Ns)

const t = length(Dna)

const Dvec = map(ConvertToNumber, Dna);
const lStrand = length(Dvec[1]);
const bFreq = sum(CalcFreq(Dvec),dims=2) / (t*lStrand);

MvecInit = []

dBestScores = distribute(zeros(nworkers()))

mMinit = Array{Array{Int64,1},1}[]
for i=1:nworkers()
Mvec = fill(zeros(Int,k), t)
push!(mMinit,Mvec)
end

dBestMvec = distribute(mMinit)

mMinit = nothing

@sync @distributed for i = 1:5760
Mvec = GibbsSampler(Dvec,MvecInit,k,t,N)
score = CalcRelEntropy(Mvec,bFreq)
if (score > dBestScores[:L][1])
dBestScores[:L][1] = score
dBestMvec[:L][1] = Mvec
end
end

BestScores = convert(Array, dBestScores)
BestMvecs = convert(Array, dBestMvec)

fScore = open("score.txt", "w")
println(fScore,BestScores)
close(fScore)

(bestScore,argmax) = findmax(BestScores)

BestMotifs = map(ConvertToLetter, BestMvecs[argmax])

for motif in BestMotifs
println(motif)
end
``````

Code for Gibb’s Sampler

``````# Dvec is an array of integer arrays
function GibbsSampler(Dvec,MvecInit,k,t,N)
lStrand = length(Dvec[1]);

nKmer = lStrand - k + 1;
bFreq = sum(CalcFreq(Dvec),dims=2)[:] / (t*lStrand);

if MvecInit==[]
# Construct random set of motifs
Mvec = fill(zeros(Int,k), t)
for s = 1:t
strand=Dvec[s];
i = rand(1:nKmer);
motif = strand[i:i+k-1];
Mvec[s] = motif;
end
else
Mvec = copy(MvecInit)
end

score = CalcRelEntropy(Mvec,bFreq)

BestMvec = copy(Mvec)
bestScore = score

for j = 1:N
i = rand(1:t)
Profile = CalcProfileLap([Mvec[1:i-1];Mvec[i+1:end]])
strand = Dvec[i]
# construct discrete PDF
Pm = zeros(nKmer)
for m = 1:nKmer
kMer = strand[m:m+k-1]
p = ProbKmerProfile(kMer, Profile, bFreq)
Pm[m] = p
end
norm = sum(Pm)
Pm /= norm

m = rand(Categorical(Pm), 1)[1]

mVec_i = strand[m:m+k-1]

Mvec = [Mvec[1:i-1];[mVec_i];Mvec[i+1:end]]

score = CalcRelEntropy(Mvec,bFreq)

if (score > bestScore)
BestMvec = copy(Mvec)
bestScore = score
end
end

return BestMvec
end

``````

Could my implementation of the calculation of base frequencies be optimized for speed? The vector lengths are short (less than a hundred).

``````function CalcFreq(Mvec)
t = length(Mvec);
k = length(Mvec[1]);

Bin = zeros(Int,4,k);

for i =1:t
for j = 1:k
Bin[Mvec[i][j],j] += 1;
end
end

return Bin
end
``````
1 Like