I am trying to get multiplication of a lot of pretty big sparse matrices, I figured out that their special structure allows me to simplify the multiplication into just a computing rows and columns for resulting sparse matrix. The problem is that i need to do do a lot of iterations and sometimes it takes a lot of time because gc() runs (usualy about 95% of time in that iteration). I checked @code_warntype nad it seems to me that there is no problem in the loop that i need to make faster. I know that using view() often helps, but I am working with arrays of differnet sizes so they cannot be put into a matrix together.
Here is my code

function sparseIndexMatrix2(indMat, nx, px, o, nw, pw)
(n, p) = size(indMat)
if p == nx*px
u = o^2
else
u = nx*px
end
rows = Array{Int64, 1}[]
cols = Array{Int64, 1}[]
for i = 1 : nx*px
(ii, jj, ~) = findnz(indMat[(i-1)*u+1:i*u, :])
push!(rows, ii)
push!(cols, jj)
println(i)
end
LL = fill(Int64[], nx^2*px^2)
q = 0
getIndices26!(LL, q, rows, cols, nx, px, nw, pw)
J = zeros(q)
T = ones(q)
println(q)
c = 1
for l = 1 : length(LL)
if isempty(LL[l]) == false
J[c:c+length(LL[l])-1] .= l
c += length(LL[l])
end
end
L = vcat(LL...)
end
tildeV = sparse(J, L, T)
end
function getIndices26!(LL::Array{Array{Int64, 1}, 1}, q::Int64, rows, cols, nx, px, nw, pw)
for i = 1 : nx*px
i1 = rows[i]
j1 = cols[i]
for j = 1 : nx*px
inds = findall(in(rows[j]), i1)
if isempty(inds) == false
LL[(i-1)*nx*px + j] = j1[inds] + (cols[j][findall(in(i1), rows[j])].-1)*nw*pw
q += length(inds)
end
end
end
return nothing
end

nxpx and nwpw are 10^4 in the case I am struggling with. Any advice how to make getIndices26!() faster? Thanks in advance.

Youâ€™re allocating an array of indices here, then doing an operation for each index, then throwing away that array of indices. Instead, you could avoid the allocation entirely by adding another loop.

For example, letâ€™s say we wanted to check which numbers in 1:N are even (not a hard problem, but an easy example). Your code is doing something akin to:

function find_evens_1(N)
x = zeros(Bool, N)
# Allocate a vector of indices
inds = findall(iseven, 1:N)
# Use those indices all at once
x[inds] .= true
end

I am suggesting, instead, that you do:

function find_evens_2(N)
x = zeros(Bool, N)
for i in 1:N
if iseven(i)
x[i] = true
end
end
end

function getIndices26!(LL::Array{Array{Int64, 1}, 1}, q::Int64, rows, cols, nx, px, nw, pw)
for i = 1 : nx*px
i1 = rows[i]
for j = 1 : nx*px
i2 = rows[j]
for k = 1 : length(i1)
for n = 1 : length(i2)
if i2[n] == i1[k]
push!(LL[(i-1)*nx*px + j], cols[i][k] + (cols[j][n]-1)*nw*pw)
q += 1
end
end
end
end
end
return nothing
end

and although it was faster, I ran out of memory. There is still one thing I donâ€™t understand. I measure time spent at each â€śiâ€ť and most of the time, it did not show any allocation at all, but sometimes there appeared some. What could possibly cause the difference? I would expect that there are some differences because of different legths of arrays of indices in each iteration, but no allocation at all? It somehow does not make sense to me.

As for why you ran out of memory, Iâ€™m not sureâ€“the updated code should use less memory than the original. I would recommend checking carefully that the new and old code actually do the same thing.