I do not think you understood the predicate I use and the structure of the matrix: For each line of the matrix, every non-zero-value is the same value. Hence the matrix-vector product fatorizes. The number of multiplications is reduced, but, you are right, the indexing is still the same.

Here is a (quickly-extracted so not that clean) MWE:

```
using SparseArrays
struct SparseLinearConstraint{T}
A::Matrix{T}
b::Vector{T}
P::SparseMatrixCSC{T, Int64}
q::SparseVector{T, Int64}
rez::SparseVector{T, Int64}
end
function SparseLinearConstraint(A::Matrix{T},b::Vector{T}) where T
AiAA = A'inv(A*A')
P = sparse(-AiAA*A)
q = sparse(+AiAA*b)
return SparseLinearConstraint(A,b,P,q,spzeros(eltype(q),length(q)))
end
@inline function proj!(x,X::SparseLinearConstraint)
X.rez .= x + X.P*x + X.q ##### This lines takes 70% of runtime, due to the sparse matrix-vector product.
end
struct SparsePositivityConstraint{T}
rez::SparseVector{T, Int64}
tol::T
end
function SparsePositivityConstraint(x)
return SparsePositivityConstraint(spzeros(eltype(x),length(x)),sqrt(eps(eltype(x))))
end
@inline function proj!(x,X::SparsePositivityConstraint)
X.rez .= x
SparseArrays.fkeep!(X.rez, (i,x) -> x>X.tol)
end
function Dykstra!(x,ctrs)
old_x = zero(x)
err = sqrt(eps(eltype(x)))
sn = [zero(x) for _ in ctrs]
while !all(abs.(old_x .-x) .< err)
old_x .= x
for (i,ctr) in enumerate(ctrs)
proj!(x-sn[i],ctr)
sn[i] += ctr.rez - x
x .= ctr.rez
end
end
return nothing
end
# A simple test case:
using Random
function linearize(a::Vector{T},b::Vector{T}) where T
n1 = length(a)
n2 = length(b)
p = (n1+1)*(n2+1)
eq_A = zeros(T,(n1+n2,p-1))
eq_b = [a...,b...]
for i in 0:n2
for j in 1:n1
eq_A[j,j+i*(1+n1)] = 1
end
end
for i in 1:n2
eq_A[n1+i,(1:(n1+1)).+n1.+(i-1)*(n1+1)] .= 1
end
return eq_A,eq_b
end
Solve_Type = Float64
Random.seed!(123)
n,m = 30,30;
a1 = rand(n);
a2 = rand(m);
x = Solve_Type.(randn((n+1)*(m+1)-1));
spx = deepcopy(x);
spx = sparse(spx);
A,b= linearize(a1,a2);
constraints = [SparseLinearConstraint(A[1:n,:],a1),
SparseLinearConstraint(A[n+1:end,:],a2),
SparsePositivityConstraint(spx)];
# Some tests:
spx = deepcopy(x);
@time Dykstra!(spx,constraints)
spx = deepcopy(x);
@time Dykstra!(spx,constraints)
spx = deepcopy(x);
spx = sparse(spx)
@profview Dykstra!(spx,constraints)
```

The main algorithm is in the `Dykstra!`

function, with a majority of the runtime in the `proj!(x,X::SparseLinearConstraint)`

function.

In this function, profview reports `SparseArrays.mul!`

to be the most time-consuming function.

EDIT: The upper code has been edited once or twice. I have now a working version of what I want, which writes as :

```
using SparseArrays
n,m = 1000,1000
x = rand(m)
P = sparse((rand(n,m).>0.97) .* rand(m));
# check predicate:
all([sum(unique(P[i,:]).!=0)<=1 for i in 1:size(P,1)])
# extract the b vector and idx vector:
b = [sum(unique(P[i,:])) for i in 1:size(P,1)]
M = sparse(BitMatrix(P .!= 0))
idx = [P[i,:].nzind for i in 1:n]
rez = similar(b)
@inline my_prod!(b,M,x) = b.* (M*x)
rez = my_prod!(b,M,x)
rez ≈ P*x
# test :
using BenchmarkTools
@btime P*$(x);
@btime my_prod!(b,M,$x);
```

Which now has the same runtime as the full sparse product, but still allocates a little too much. I’m sure we can do better.