I have a co-assembly and some alignment data telling me if a reads from a sample that helped spawn that co-assembly align to a contig. I am using this to write a fasta file for that sample.
t1 = Threads.@spawn DataFrame(CSV.File(depth, delim="\t"))
abundf = fetch(t1)
rename!(abundf, [:Contig,:trimmed_mean])
present = @subset!(abundf, :trimmed_mean .!= 0)
passContig = Set(present[:,"Contig"])
Here I am reading in and removing all rows that have a depth of 0 indicating that no reads from that sample aligned to that contig (where each row is a contig in the co assembly)
reader = FASTA.Reader(GzipDecompressorStream(open(assembly))) #if gzipped
# Generating the writer to push the output into
writer = open(FASTA.Writer, "D:/OneDrive - University of Copenhagen/PhD/Projects/Termite_metagenomes/assembly/matriline/bySample/"*sample*".fasta")
println("Writing to file")
# Looping over the contigs
record = FASTA.Record()
while !eof(reader)
read!(reader, record)
if in(FASTA.identifier(record), passContig)
write(writer, record)
end
end
close(reader)
Next I am opening the co-assembly and going over its headers and seeing if that header is in the passContig
list for the given sample.
However the length of passContig
is not equal to the resulting number of reads in the fasta being written. This should not be the case as any contig present in the passContig
is in the co-assembly.
> length(passContig)
121964
whilst the number of reads in the fasta is: 121955 (generated from grep -c ">" sample.fasta
)
Therefore I next tried to identify what contigs were being missed:
reader = FASTA.Reader(GzipDecompressorStream(open(assembly))) #if gzipped
headers= Set([FASTA.identifier(rec) for rec in reader])
close(reader)
for r in passContig
if !(r in headers)
println(r)
elseif r in headers
println("it is here")
end
end
However this results in nothing but “it is here”. Would anyone know why this is happening?