Thanks in advance for your comments and advice.
Could someone comment on how to improve my code? In particular:
- Did I use `@sync` properly?
- Did I use distributed arrays (DistributedArrays.jl) efficiently?
- How does random number generation work on parallel cores (worker processes)? Should each worker be started with its own seed?
using Gibbs:ConvertToNumber,ConvertToLetter,CalcFreq
@everywhere using Gibbs:GibbsSampler,CalcRelEntropy
@everywhere using DistributedArrays
@everywhere using Distributions
# Parallel driver: run many independent GibbsSampler restarts across the worker
# processes and keep, per worker, the highest-scoring motif set.
#
# Usage: julia -p <nprocs> <script> <k> <N> [nRestarts]
#   k          parsed as Int and passed to GibbsSampler; also the length of each
#              placeholder motif vector created below
#   N          parsed as Int and passed to GibbsSampler
#              (presumably the number of sampling iterations -- confirm)
#   nRestarts  optional number of independent restarts (default 5760)
length(ARGS) >= 2 || error("usage: julia -p <nprocs> <script> <k> <N> [nRestarts]")
ks = ARGS[1]
Ns = ARGS[2]
const k = parse(Int, ks)
const N = parse(Int, Ns)
const nRestarts = length(ARGS) >= 3 ? parse(Int, ARGS[3]) : 5760

Dna = readlines("Dna.txt")
isempty(Dna) && error("Dna.txt contains no sequences")
const t = length(Dna)
const Dvec = map(ConvertToNumber, Dna)
const lStrand = length(Dvec[1])
# Overall frequency of each symbol: per-position counts summed over positions and
# divided by the total number of entries (assumes every strand has length lStrand).
const bFreq = sum(CalcFreq(Dvec), dims=2) / (t * lStrand)

# An empty MvecInit is handed to GibbsSampler unchanged.
# NOTE(review): presumably GibbsSampler then picks its own starting motifs -- confirm.
# To start from a motif file instead:
#     const MvecInit = map(ConvertToNumber, readlines("Motifs.txt"))
const MvecInit = []

# Per-worker "best so far" storage. Each worker reads and writes only its own local
# part, so the loop body needs no cross-worker synchronization.
# NOTE(review): relies on distribute placing one element on each worker -- confirm
# against the default layout of DistributedArrays.
# Scores start at -Inf so the first sample seen by a worker always replaces the
# placeholder, even if its score is <= 0.
dBestScores = distribute(fill(-Inf, nworkers()))
# Placeholder motif sets (t zero vectors of length k per worker). Comprehensions make
# every inner vector a distinct array; fill(zeros(Int, k), t) would repeat one shared
# array t times.
dBestMvec = distribute([[zeros(Int, k) for _ in 1:t] for _ in 1:nworkers()])

# @sync blocks until all workers have finished their share of the iterations; a
# reducer-less @distributed loop would otherwise return before the work is done.
# NOTE(review): each worker process has its own RNG state; for reproducible runs,
# seed every worker explicitly (e.g. derived from myid()) before this loop --
# confirm how GibbsSampler draws its random numbers.
@sync @distributed for i = 1:nRestarts
    Mvec = GibbsSampler(Dvec, MvecInit, k, t, N)
    score = CalcRelEntropy(Mvec, bFreq)
    localScores = localpart(dBestScores)
    if score > localScores[1]
        localScores[1] = score
        localpart(dBestMvec)[1] = Mvec
    end
end

# Gather the per-worker results on the calling process.
BestScores = convert(Array, dBestScores)
BestMvecs = convert(Array, dBestMvec)

# The do-block form closes the file even if println throws.
open("score.txt", "w") do fScore
    println(fScore, BestScores)
end

# iBest rather than argmax, so Base.argmax is not shadowed.
(bestScore, iBest) = findmax(BestScores)
BestMotifs = map(ConvertToLetter, BestMvecs[iBest])
for motif in BestMotifs
    println(motif)
end
Could my implementation of the base-frequency calculation be optimized for speed? The vectors are short (fewer than a hundred elements).
"""
    CalcFreq(Mvec; nsymbols=4)

Count how often each symbol occurs at each position of the sequences in `Mvec`.

# Arguments
- `Mvec`: non-empty collection of integer-coded sequences. Every entry that is read
  must lie in `1:nsymbols`, and every sequence must have at least as many entries as
  the first one (entries beyond that length are ignored).

# Keywords
- `nsymbols`: number of rows of the result, i.e. the alphabet size (default `4`).

# Returns
An `nsymbols`-by-`k` `Matrix{Int}`, where `k = length(Mvec[1])` and entry `[s, j]` is
the number of sequences whose `j`-th entry equals `s`.

# Throws
- `ArgumentError` if `Mvec` is empty.
- `BoundsError` if a symbol lies outside `1:nsymbols` or a sequence is shorter than
  the first one.
"""
function CalcFreq(Mvec; nsymbols::Integer=4)
    isempty(Mvec) && throw(ArgumentError("Mvec must contain at least one sequence"))
    k = length(Mvec[1])
    Bin = zeros(Int, nsymbols, k)
    # Each sequence is looked up once here instead of once per position.
    for seq in Mvec
        # Deliberately no @inbounds: the row index comes from the data and must stay
        # bounds-checked.
        for j in 1:k
            Bin[seq[j], j] += 1
        end
    end
    return Bin
end