Hi,
Thank you very much for looking into my code. Below is my response for your comments.
Yes, indeed. So, below is a simiulated data that you could use to test my code. Please let me know whether it works for you.
# test data
using LinearAlgebra, SparseArrays, LazyArrays
v = 580;
q = 43416;
r = 18;
c = 36;
Nr = (1/c).*sprandn(v, r, 0.20);
Nc = (1/r).*sprandn(v, c, 0.20);
valSwaA = [rand(1:r,q) rand(1:c,q)];
valSwaB = [rand(1:r,q) rand(1:c,q)];
valSwaA = [CartesianIndex(valSwaA[ii,1],valSwaA[ii,2]) for ii in 1:q];
valSwaB = [CartesianIndex(valSwaB[ii,1],valSwaB[ii,2]) for ii in 1:q];
Wr = spzeros(Float64,v,q);
Wc = spzeros(Float64,v,q);
br = fill((1/c)*2,q)
bc = fill((1/r)*2,q)
nq = collect(1:q);
tA = rand(1:v,q)
tB = rand(1:v,q)
Then, you could run myfun3 as follows:
myfun3(nq, v, r, c, Nr, Nc,
valSwaA, valSwaB,
tA, tB,
Wr, Wc,
br, bc);
Based on your thoughts, I implemented a new version of my function as follows
# I used your suggestion 2, where three mutable structures are defined
# (i.e., input data, processed data in a function, and output data)
# input: temporary input data
mutable struct temIntStr
nq::Array{Int64,1}
v::Int64
r::Int64
c::Int64
Nr::SparseMatrixCSC{Float64,Int64}
Nc::SparseMatrixCSC{Float64,Int64}
valSwaA::Array{Int64,2}
valSwaB::Array{Int64,2}
tA::Array{Int64,1}
tB::Array{Int64,1}
Wr::SparseMatrixCSC{Float64,Int64}
Wc::SparseMatrixCSC{Float64,Int64}
br::Array{Float64,1}
bc::Array{Float64,1}
temIntStr(nq,v,r,c,Nr,Nc,valSwaA,valSwaB,tA,tB,Wr,Wc,br,bc) = new(nq,v,r,c,Nr,Nc,valSwaA,valSwaB,tA,tB,Wr,Wc,br,bc)
end
# processed data: temporary structure
mutable struct temPro
Δ1::Array{Float64,1}
Δ2::Array{Float64,1}
χ::Array{Float64,1}
wra::Array{Float64,1}
wrb::Array{Float64,1}
wca::Array{Float64,1}
wcb::Array{Float64,1}
temPro(Δ1,Δ2,χ,wra,wrb,wca,wcb) = new(Δ1,Δ2,χ,wra,wrb,wca,wcb)
end
# output: eigenvalues and eigvectors information
mutable struct eigInf
q::Int64
λ1::Array{Float64,1}
λ2::Array{Float64,1}
P1::SparseMatrixCSC{Float64,Int64}
P2::SparseMatrixCSC{Float64,Int64}
Δt::SparseMatrixCSC{Float64,Int64}
nl1::Array{Float64,1}
nl2::Array{Float64,1}
P1a::Array{Float64,1}
P1b::Array{Float64,1}
P2a::Array{Float64,1}
P2b::Array{Float64,1}
eigInf(q,λ1,λ2,P1,P2,Δt,nl1,nl2,P1a,P1b,P2a,P2b) = new(q,λ1,λ2,P1,P2,Δt,nl1,nl2,P1a,P1b,P2a,P2b)
end
Then, I propose a new function as follows
function myfun4(iObj::temIntStr,
pObj::temPro,
rObj::eigInf)
eA = iObj.valSwaA[:,1]
eB = iObj.valSwaB[:,1]
fA = iObj.valSwaA[:,2]
fB = iObj.valSwaB[:,2]
# identifying two vectors br and bc
idxR = view(eA,:).==view(eB,:)
idxC = view(fA,:).==view(fB,:)
iObj.br[idxR] .= 0
iObj.bc[idxC] .= 0
# TODO: these following operations seem to be quite expensive.
# This might be something related to your suggestion 1. I am not quite sure whether this be
# improved futher since I also don't have much experience with SparseMatrices. Perhaps, we could
# write a loop to perfom these commands to avoid reallocations here?
iObj.Wr .= iObj.Nr[:,eB] .- iObj.Nr[:,eA]
iObj.Wc .= iObj.Nc[:,fB] .- iObj.Nc[:,fA]
# linear indices
tAIdx = iObj.tA .+ iObj.v .* (iObj.nq .-1)
tBIdx = iObj.tB .+ iObj.v .* (iObj.nq .-1)
tABIdx = ApplyArray(vcat,tAIdx,tBIdx)
pObj.wra .= view(iObj.Wr,tAIdx)
pObj.wrb .= view(iObj.Wr,tBIdx)
pObj.wca .= view(iObj.Wc,tAIdx)
pObj.wcb .= view(iObj.Wc,tBIdx)
iObj.Wr[tABIdx].= 0
iObj.Wc[tABIdx].= 0
# TODO: can this be improved futher?
pObj.Δ1 .= pObj.wra .- pObj.wrb .+ iObj.br .+ pObj.wca .- pObj.wcb .+ iObj.bc
pObj.Δ2 .= pObj.wra .+ pObj.wrb .+ pObj.wca .+ pObj.wcb
rObj.Δt .= iObj.Wr .+ iObj.Wc
# Here I used your suggestion 4 to avoid allocations.
mulSparse(pObj.χ,rObj.Δt,pObj.Δ1,pObj.Δ2)
rObj.λ1 .= pObj.Δ1 .+ pObj.χ
rObj.λ2 .= pObj.Δ1 .- pObj.χ
lmul!(2.0,rObj.Δt)
# normalize vectors here
# Here I used your suggestion 3 to avoid allocations.
mulVec(rObj.nl1,rObj.λ1,pObj.χ)
mulVec(rObj.nl2,rObj.λ2,pObj.χ,true)
rObj.P1 .= rObj.Δt
rObj.P2 .= rObj.Δt
rObj.P1a .= pObj.Δ2 .+ rObj.λ1
rObj.P1b .= pObj.Δ2 .- rObj.λ1
rObj.P2a .= pObj.Δ2 .+ rObj.λ2
rObj.P2b .= pObj.Δ2 .- rObj.λ2
rObj.P1[tABIdx] = ApplyArray(vcat,rObj.P1a,rObj.P1b)
rObj.P2[tABIdx] = ApplyArray(vcat,rObj.P2a,rObj.P2b)
return nothing
end
function mulSparse(y::Array{Float64,1},
A::SparseMatrixCSC{Float64,Int64},
b1::Array{Float64,1},
b2::Array{Float64,1})
#rowval = A.rowval
nzval = A.nzval
fill!(y,0.0)
@inbounds for i ∈ eachindex(y)
for k=A.colptr[i]:(A.colptr[i+1]-1)
#ki=rowval[k] # index
#kv=nzval[k] # values
y[i] += 2*abs2(nzval[k])
end
y[i] += abs2(b1[i]) + abs2(b2[i])
y[i] = sqrt(y[i])
end
end
function mulVec(y::Array{Float64,1},
a1::Array{Float64,1},
a2::Array{Float64,1},sign=false)
if ~sign
@inbounds @simd for i ∈ eachindex(y)
y[i] = 2*sqrt(a1[i]*a2[i])
end
else
@inbounds @simd for i ∈ eachindex(y)
y[i] = 2*sqrt(-a1[i]*a2[i])
end
end
end
In order to run myfun4, you could perform the following commands:
v = 580;
q = 43416;
r = 18;
c = 36;
Nr = (1/c).*sprandn(v, r, 0.20);
Nc = (1/r).*sprandn(v, c, 0.20);
valSwaA = [rand(1:r,q) rand(1:c,q)];
valSwaB = [rand(1:r,q) rand(1:c,q)];
Wr = spzeros(Float64,v,q);
Wc = spzeros(Float64,v,q);
br = fill((1/c)*2,q);
bc = fill((1/r)*2,q);
nq = collect(1:q);
tA = rand(1:v,q);
tB = rand(1:v,q);
const iObj = temIntStr(nq, v, r, c,
Nr, Nc,
valSwaA, valSwaB,
tA, tB,
Wr, Wc,
br, bc);
# proccessed object:
const pObj = temPro(zeros(Float64,q),
zeros(Float64,q),
zeros(Float64,q),
zeros(Float64,q),
zeros(Float64,q),
zeros(Float64,q),
zeros(Float64,q));
# output:
const rObj = eigInf(q,
zeros(Float64,q),
zeros(Float64,q),
zeros(Float64,v,q),
zeros(Float64,v,q),
zeros(Float64,v,q),
zeros(Float64,q),
zeros(Float64,q),
zeros(Float64,q),
zeros(Float64,q),
zeros(Float64,q),
zeros(Float64,q));
@btime myfun4($iObj, $pObj, $rObj);
Now, I used BenchmarkTools to obtain computational times for myfun3 and myfun4. Here they are:
myfun3: 2.187 s (347540 allocations: 2.69 GiB)
myfun4: 945.036 ms (173775 allocations: 764.86 MiB)
As you can see, the performance of myfun4 is much better than the performance of myfun3. But it seems there are still a lot of allocations with myfun4. Do you think it is possible to improve this further? Thank you very much in advance! Also, thank you for showing the two reference links. Indeed, I have checked those links!