Hello,
I am new at programming and hope you guys can give me a hand, please.
I have three plain text files with columns called ID_#
File a
Header
Header
Header
Header
Header
ID_1 ID_5 ID_6 ID_7 ID_8
File b
ID_1 ID_2 ID_3 ID_8 ID_9 ID_10
File c
ID_1 ID_2 ID_3 ID_4
I have the code below. It works but it is extremely slow when the file named a is larger. Any ideas on how to boost the speed?
Thanks
The code is
## Packages and Libraries
using CSV
using DataFrames
using Arrow
using Statistics
using DelimitedFiles
using DataFramesMeta
using StatsBase
using Query
using JLD
using BenchmarkTools
using Profile
##
## Read files
a = CSV.read(
"Path\\File\\a.txt",
DataFrame,
header = 6,
type = String,
select = [1, 2, 3, 4],
footerskip = 1,
missingstrings = ["NA", "-"],
)
b = CSV.read(
"Path\\File\\b.txt",
DataFrame,
missingstrings = ["NA", "-"],
)
c = CSV.read(
"Path\\File\\c.txt",
DataFrame,
missingstrings = ["NA", "-"],
select = [1, 2, 3],
types = Dict(:ID_1 => String, :ID_2 => UInt8, :ID_3 => UInt32),
delim = "\t",
silencewarnings = true
)
##
## Data preprocessing
# Remove missing ID_2s and ID_3s
c = c[completecases(c), :]
# Remove X ID_2 from b, and c
b = b[b[:, :ID_2].!=30, :]
c = c[c[:, :ID_2].!=30, :]
# Merge a with c
a = innerjoin(a, c, on = :SNP_Name => :ID_1)
##
## Function to evaluate whether an ID_2 match between the b's profile and the potential a
function evaluate_ID_2_match(ID_2, potential_a)
## Define parameters
local minimum_profile_segment_length = 0.3 # Proportion of the ID_2 equivalent to minimum length to match.
##
# b's profile for the ID_2
b_ID_2 = b[b[:, :ID_2].==ID_2, :hap]
a_ID_2_ID_6 = potential_a[potential_a[:, :ID_2].==ID_2, :ID_6]
a_ID_2_ID_7 = potential_a[potential_a[:, :ID_2].==ID_2, :ID_7]
# mismatch per ID_1 (mpm). It is true when there is a mismatch between the b's ID_1 and the a's ID_1.
local mpm = Vector{UInt8}
mpm = (b_ID_2 .!= a_ID_2_ID_6) .& (b_ID_2 .!= a_ID_2_ID_7)
# add a true at the end, and end point pot nmr in case there is no any other.
mpm = push!(mpm, true)
# remove missing from mpm
mpm = mpm[findall(!ismissing, mpm)]
# Length of the b's profile for the ID_2
local len_b_hap::UInt32 = length(mpm)
# Last ID_1 to start counting nmr
local last_ID_1_nmr::UInt32 = ceil(Int, len_b_hap * (1-minimum_profile_segment_length))
# Minimum nmr to match the b's profile with the a
minimum_nmr = (len_b_hap * minimum_profile_segment_length) + 1
# Evaluate whether there is any nmr grater than or equal to the minimum nmr to consider the a match the b's profile
for ID_1::UInt32 = 1:last_ID_1_nmr
local nmr::UInt32 = findnext(mpm[ID_1:length(mpm)], true)
if nmr >= minimum_nmr
local result_chr::UInt8 = 1
return(result_chr)
break
end
end
end
##
## Analysis of potential a
function analysis_a(a)
local mismatch_a::UInt16 = 0
local match_a::UInt16 = 0
local chr_correct_a_cutoff = 20
local chr_wrong_a_cutoff::UInt8 = 29 - chr_correct_a_cutoff # Wrong a cutoff. After chr_wrong_a_cutoff of autosomes equal to mismatch (zero), then the remaining autosomes do not need to be compared because they cannot result in exceeding the cutoff of chr_correct_a_cutoff
local result_a = UInt8
for ID_2::UInt8 = 1:29
ID_2 = 30 - ID_2 # Start from last ID_2. Example, the first ID_2 to analyse is the 29
if evaluate_ID_2_match(ID_2, a) == nothing
while mismatch_a < chr_wrong_a_cutoff
mismatch_ID_2 = 1
mismatch_a += mismatch_ID_2
result_a = 0
end
else
match_ID_2 = 1
match_a += match_ID_2
if match_a >= chr_correct_a_cutoff
result_a = 1
end
end
end
return(result_a)
end
##
##
# Filter a pre-allocating the array a
function a_filter(potential_a::DataFrames.AbstractDataFrame, a_dataset::DataFrames.AbstractDataFrame, a_ID::AbstractVector{String})
potential_a = a_dataset[ ( a_dataset.Sample_ID .== a_ID ) , :]
potential_a
end
##
## Analysis of potential a
function analysis_group_a(a_dataset = a)
potential_a = DataFrame()
result_a = 0
number_of_potential_a::UInt32 = length(unique(a_dataset[:, :Sample_ID]))
for pot_a::UInt32 = 1:number_of_potential_a
a_ID = unique(a_dataset[:, :Sample_ID])[[pot_a]]
analysis_a(a_filter(potential_a, a_dataset, a_ID)) == 1 && return print(a_ID)
end
end
##
## End module
end
##
## Check the speed
using BenchmarkTools
using Profile
using Pkg
using ProfileView
time a_module.analysis_group_a()
benchmark a_module.analysis_group_a()
profview a_module.analysis_group_a()
##```