Best Practices for Parallel implementation of Gibbs Sampler algorithm on single machine

deburgess · August 20, 2020, 1:52pm

I am re-posting in this section as it seems a more relevant forum thant “Julia at Scale”. Thanks in advance for your comments and advice.

Would someone comment on how to improve my coding? In particular:

did I use @sync properly?
Did I use Distributed arrays efficiently?Would someone comment on how to improve my coding?
How does random number generation work on parallel cores? Should each core be started with its own seed?

using Gibbs:ConvertToNumber,ConvertToLetter,CalcFreq
@everywhere using Gibbs:GibbsSampler,CalcRelEntropy

@everywhere using DistributedArrays

@everywhere using Distributions

ks = ARGS[1]
Ns = ARGS[2]
const k = parse(Int,ks)
const N = parse(Int,Ns)

Dna = readlines("Dna.txt")
const t = length(Dna)

const Dvec = map(ConvertToNumber, Dna);
const lStrand = length(Dvec[1]);
const bFreq = sum(CalcFreq(Dvec),dims=2) / (t*lStrand);

MvecInit = []

dBestScores = distribute(zeros(nworkers()))

mMinit = Array{Array{Int64,1},1}[]
for i=1:nworkers()
	Mvec = fill(zeros(Int,k), t)
       	push!(mMinit,Mvec)
end

dBestMvec = distribute(mMinit)

mMinit = nothing

@sync @distributed for i = 1:5760
       Mvec = GibbsSampler(Dvec,MvecInit,k,t,N)
       score = CalcRelEntropy(Mvec,bFreq)
       if (score > dBestScores[:L][1])
       		dBestScores[:L][1] = score
		dBestMvec[:L][1] = Mvec
       end
end

BestScores = convert(Array, dBestScores)
BestMvecs = convert(Array, dBestMvec)

fScore = open("score.txt", "w")
println(fScore,BestScores)
close(fScore)

(bestScore,argmax) = findmax(BestScores)

BestMotifs = map(ConvertToLetter, BestMvecs[argmax]) 

for motif in BestMotifs 
	println(motif)
end

Code for Gibb’s Sampler

# Dvec is an array of integer arrays
function GibbsSampler(Dvec,MvecInit,k,t,N) 
	lStrand = length(Dvec[1]);

	nKmer = lStrand - k + 1; 
	bFreq = sum(CalcFreq(Dvec),dims=2)[:] / (t*lStrand);

	if MvecInit==[]
		# Construct random set of motifs
		Mvec = fill(zeros(Int,k), t)
		for s = 1:t
       			strand=Dvec[s];
			i = rand(1:nKmer);
       			motif = strand[i:i+k-1];
       			Mvec[s] = motif;
       		end
	else
		Mvec = copy(MvecInit)
	end

	score = CalcRelEntropy(Mvec,bFreq)

	BestMvec = copy(Mvec)
	bestScore = score

	for j = 1:N
		i = rand(1:t)
		Profile = CalcProfileLap([Mvec[1:i-1];Mvec[i+1:end]])
		strand = Dvec[i]
		# construct discrete PDF
		Pm = zeros(nKmer)
		for m = 1:nKmer
			kMer = strand[m:m+k-1]
			p = ProbKmerProfile(kMer, Profile, bFreq)
			Pm[m] = p
		end
		norm = sum(Pm)
		Pm /= norm

		m = rand(Categorical(Pm), 1)[1]

		mVec_i = strand[m:m+k-1]

		Mvec = [Mvec[1:i-1];[mVec_i];Mvec[i+1:end]]

		score = CalcRelEntropy(Mvec,bFreq)

		if (score > bestScore)
			BestMvec = copy(Mvec)
			bestScore = score
		end
	end

	return BestMvec
end

Could my implementation of the calculation of base frequencies be optimized for speed? The vector lengths are short (less than a hundred).

function CalcFreq(Mvec)
	t = length(Mvec);
	k = length(Mvec[1]);

	Bin = zeros(Int,4,k);

	for i =1:t
		for j = 1:k 
			Bin[Mvec[i][j],j] += 1;
		end
	end

	return Bin
end

Topic		Replies	Views
Best Practices for Parallel implementation of Gibbs Sampler algorithm Julia at Scale	0	361	August 7, 2020
Need help fixing Gibbs Sampler using DArrays New to Julia	1	276	October 14, 2020
Gibbs sampler works in R doesn't in Julia (full Julia code provided) please help General Usage	22	2480	October 25, 2018
Linear regression Gibbs sampler Probabilistic programming	2	794	July 3, 2021
Trying to understand parallel performance in Turing Statistics turing	6	1649	February 23, 2022

Best Practices for Parallel implementation of Gibbs Sampler algorithm on single machine

Related topics