Hi everyone
I am a new user of Julia. I want to parse data from a file with over 50 GB. I have used CSV.file to read a set of data (every 427,905 lines) using skipto=start_m,limit=NumberEffects,ntasks=1.
The file contains 2,567,430,000 lines. With every iteration, the time is doubled. It seems to be the function read all the files again until the position indicated in limit argument. So I don’t know how to read large data and split it more efficient. Could someone suggest some code, commands, o packages?
Code below:
function ComputeVariancePerGeneration(Route_NameOfFile::String, NumberEffects::Int64, Iter::Int64, group_1::Dict, gener::Vector{Int64}, EffectPosition::Int64, called::String="Mutational variance")
MSM = string("You are calculating the elements of posterior marginal distribution of ",called,"\n", "BE CAREFUL writing the arguments","\n")
printstyled(MSM,bold=false,color=:light_yellow)
values_var = var_vector(Dict{Int64, Vector{Float64}}(),Dict{Int64, Float64}())
start_m = 1
end_m = NumberEffects
for i in gener
values_var.vsam_par[i] = Float64[]
values_var.mean_PMD[i] = -1.0
end
for itera in 1:Iter
all_solut = Matrix(CSV.read(Route_NameOfFile,delim=' ',ignorerepeated=true,header=false,skipto=start_m,limit=NumberEffects,ntasks=1,DataFrame))
all_solut = all_solut[all.(==(EffectPosition), eachrow(all_solut[:,2])), :]
for i in gener
current_value = round(var(all_solut[findall(in( values(group_1.Vari_par[i]) ),all_solut[:,3]), [4]]), digits=8)
push!(values_var.vsam_par[i], current_value)
end
start_m = end_m + 1
end_m = start_m + NumberEffects - 1
println(itera)
end
@show last(all_solut)
for m in gener
values_var.mean_PMD[m] = mean(values(values_var.vsam_par[m]))
end
MSM = string("The first element is a vector of variance elements per generation,","\n", "the second element is the mean of variance component parameter per generation codification")
printstyled(MSM,bold=false,color=:light_cyan)
return (values_var.vsam_par,values_var.mean_PMD)
end
route_2 = raw"I:\NARUTO\870-3020\model_AC+Mutat"
Route_NameOfFile= route_2*"/all_solutions2"
NumberEffects = 427905
Iter = 6000
group_1 = member_generat[2]
gener = [0,1,3,4,5,6,7,8]
EffectPosition = 2
called = "additive genetic variance"