I am re-posting in this section as it seems a more relevant forum thant “Julia at Scale”. Thanks in advance for your comments and advice.
Would someone comment on how to improve my coding? In particular:
- did I use @sync properly?
- Did I use Distributed arrays efficiently?Would someone comment on how to improve my coding?
- How does random number generation work on parallel cores? Should each core be started with its own seed?
using Gibbs:ConvertToNumber,ConvertToLetter,CalcFreq
@everywhere using Gibbs:GibbsSampler,CalcRelEntropy
@everywhere using DistributedArrays
@everywhere using Distributions
ks = ARGS[1]
Ns = ARGS[2]
const k = parse(Int,ks)
const N = parse(Int,Ns)
Dna = readlines("Dna.txt")
const t = length(Dna)
const Dvec = map(ConvertToNumber, Dna);
const lStrand = length(Dvec[1]);
const bFreq = sum(CalcFreq(Dvec),dims=2) / (t*lStrand);
MvecInit = []
dBestScores = distribute(zeros(nworkers()))
mMinit = Array{Array{Int64,1},1}[]
for i=1:nworkers()
Mvec = fill(zeros(Int,k), t)
push!(mMinit,Mvec)
end
dBestMvec = distribute(mMinit)
mMinit = nothing
@sync @distributed for i = 1:5760
Mvec = GibbsSampler(Dvec,MvecInit,k,t,N)
score = CalcRelEntropy(Mvec,bFreq)
if (score > dBestScores[:L][1])
dBestScores[:L][1] = score
dBestMvec[:L][1] = Mvec
end
end
BestScores = convert(Array, dBestScores)
BestMvecs = convert(Array, dBestMvec)
fScore = open("score.txt", "w")
println(fScore,BestScores)
close(fScore)
(bestScore,argmax) = findmax(BestScores)
BestMotifs = map(ConvertToLetter, BestMvecs[argmax])
for motif in BestMotifs
println(motif)
end
Code for Gibb’s Sampler
# Dvec is an array of integer arrays
function GibbsSampler(Dvec,MvecInit,k,t,N)
lStrand = length(Dvec[1]);
nKmer = lStrand - k + 1;
bFreq = sum(CalcFreq(Dvec),dims=2)[:] / (t*lStrand);
if MvecInit==[]
# Construct random set of motifs
Mvec = fill(zeros(Int,k), t)
for s = 1:t
strand=Dvec[s];
i = rand(1:nKmer);
motif = strand[i:i+k-1];
Mvec[s] = motif;
end
else
Mvec = copy(MvecInit)
end
score = CalcRelEntropy(Mvec,bFreq)
BestMvec = copy(Mvec)
bestScore = score
for j = 1:N
i = rand(1:t)
Profile = CalcProfileLap([Mvec[1:i-1];Mvec[i+1:end]])
strand = Dvec[i]
# construct discrete PDF
Pm = zeros(nKmer)
for m = 1:nKmer
kMer = strand[m:m+k-1]
p = ProbKmerProfile(kMer, Profile, bFreq)
Pm[m] = p
end
norm = sum(Pm)
Pm /= norm
m = rand(Categorical(Pm), 1)[1]
mVec_i = strand[m:m+k-1]
Mvec = [Mvec[1:i-1];[mVec_i];Mvec[i+1:end]]
score = CalcRelEntropy(Mvec,bFreq)
if (score > bestScore)
BestMvec = copy(Mvec)
bestScore = score
end
end
return BestMvec
end
Could my implementation of the calculation of base frequencies be optimized for speed? The vector lengths are short (less than a hundred).
function CalcFreq(Mvec)
t = length(Mvec);
k = length(Mvec[1]);
Bin = zeros(Int,4,k);
for i =1:t
for j = 1:k
Bin[Mvec[i][j],j] += 1;
end
end
return Bin
end