Just out of curiosity, has anyone written up a sparse matrix-vector product using the new multithreading stuff? The At_mul_B operation works great because all the writes are disjoint between threads. But I was curious about the best strategy to implement the A_mul_B operation.
The code I’ve gotten at the moment is: (Which is very rough, just wanted something to start with!)
using Iterators
#simple_partition(n::Int, k::Int) = push!(map(first, partition(1:n,floor(Int,n/k))),n+1)
function simple_partition(n::Int,k::Int)
  rval = ones(Int,k+1)
  m,r = divrem(n,k)
  fill!(rval, m)
  rval[1] = 1
  for i = 2:k+1
    rval[i] += r .> (i-2)
  end
  cumsum!(rval,rval)
end
import Base.LinAlg.At_mul_B, Base.LinAlg.At_mul_B!, Base.A_mul_B, Base.LinAlg.A_mul_B!
"""
A type to encapsulate the multithreaded matrix-vector product operation.
Usage
-----
~~~~
A = sprandn(10^5,10^5,100/10^5); # create a fairly dense sparse matrix.
x = randn(10^5);
M = MultithreadedMatVec(A, simple_partition(A.n, Threads.nthreads()));
y = M.'*x;
norm(y - A.'*x)
~~~~
"""
mutable struct MultithreadedMatVec{T,I} <: AbstractArray{T,2}
  A::SparseMatrixCSC{T,I}
  regions::Vector{Int}
  MultithreadedMatVec(A::SparseMatrixCSC) = MultithreadedMatVec(A,Threads.nthreads())
  MultithreadedMatVec(A::SparseMatrixCSC, k::Int) = MultithreadedMatVec(A, simple_partition(A.n,k))
  function MultithreadedMatVec(A::SparseMatrixCSC{T,I}, regions::Vector{Int}) where {T,I}
    new{T,I}(A, regions)
  end
end
import Base.size
size(M::MultithreadedMatVec,inputs...) = size(M.A,inputs...)
# Julia's built in multiplication operations are called with
# A_mul_B!
# Ac_mul_B!
# At_mul_B!
# which take in
# α::Number, A::SparseMatrixCSC, B::StridedVecOrMat, β::Number, C::StridedVecOrMat
# and compute
# βC += αA B
# βC += αA^* B
# βC += αA^T B
# respectively
# look in base/sparse/linalg.jl
# for their implementations
""" Run the internal loop """
function internal_loop_transmult(C,B,nzv,rv,colptr,i1,i2,α::Number)
  for k=1:size(C,2)
    @inbounds for col=i1:i2
      tmp = zero(eltype(C))
      for j=colptr[col]:(colptr[col+1]-1)
        tmp += nzv[j]*B[rv[j],k]
      end
      C[col,k] += α*tmp
    end
  end
  return
end
"""
we are going to make these work with MuilthreadedMatVec types
"""
function At_mul_B!(α::Number, M::MultithreadedMatVec, B::StridedVecOrMat, β::Number, C::StridedVecOrMat)
  M.A.m == size(B,1) || throw(DimensionMismatch())
  M.A.n == size(C,1) || throw(DimensionMismatch())
  size(B,2) == size(C,2) || throw(DimensionMismatch())
  nzv = M.A.nzval
  rv = M.A.rowval
  colptr = M.A.colptr
  if β != 1
    β != 0 ? scale!(C, β) : fill!(C, zero(eltype(C)))
  end
  # this is the parallel construction
  Threads.@threads for t=1:(length(M.regions)-1)
    internal_loop_transmult(C,B,nzv,rv,colptr,M.regions[t],M.regions[t+1]-1,α)
  end
  C
end
function At_mul_B(M::MultithreadedMatVec{TA,S}, x::StridedVector{Tx}) where {TA,S,Tx}
  T = promote_type(TA,Tx)
  At_mul_B!(one(T), M, x, zero(T), similar(x, T, M.A.n))
end
function internal_loop_mult(C,B,nzv,rv,colptr,i1,i2,α::Number)
  for k=1:size(C,2)
    @inbounds for col=i1:i2
      αxj = α*B[col,k]
      for j=colptr[col]:(colptr[col+1]-1)
        # need to be done atomically... (ideally...)
        Threads.@atomic C[rv[j],k] += nzv[j]*αxj
      end
    end
  end
  return
end
function A_mul_B!(α::Number, M::MultithreadedMatVec, B::StridedVecOrMat, β::Number, C::StridedVecOrMat)
  M.A.n == size(B,1) || throw(DimensionMismatch())
  M.A.m == size(C,1) || throw(DimensionMismatch())
  size(B,2) == size(C,2) || throw(DimensionMismatch())
  nzv = M.A.nzval
  rv = M.A.rowval
  colptr = M.A.colptr
  if β != 1
    β != 0 ? scale!(C, β) : fill!(C, zero(eltype(C)))
  end
  # this is the parallel construction
  Threads.@threads for t=1:(length(M.regions)-1)
    internal_loop_mult(C,B,nzv,rv,colptr,M.regions[t],M.regions[t+1]-1,α)
  end
  C
end
import Base.eltype
eltype(M::MultithreadedMatVec{T,I}) where {T,I} = T
function A_mul_B!(C::StridedVecOrMat, M::MultithreadedMatVec, B::StridedVecOrMat)
  T = promote_type(eltype(M), eltype(B))
  A_mul_B!(one(T), M, B, zero(T), C)
end
The A_mul_B doesn’t work because the “Threads.@atomic” macro doesn’t exist. Removing this would run, but likely would give the wrong answer depending on your multithreading environment.
There’s the whole Threads.atomic* set of operations, which could be adapted for this task in various ways. I was just curious if anyone had actually worked this out in Julia and looked at any performance results. (I’m sure this has been done before many times in other languages :))