A while ago I found a curious performance regression in a very simple change to the sparse mat-vec product. Today I tried the same on Julia 0.7 and the results were exactly the opposite*:

```
> bench() # Julia 0.6.2
(Trial(634.698 μs), Trial(1.479 ms))
> bench() # Julia 0.7 latest nightly
(Trial(1.290 ms), Trial(683.206 μs))
```

The benchmark is as follows:

```
using BenchmarkTools
import Base: start, next, done
# Half-open interval [lo, hi): `hi` is excluded, so a colptr pair can be
# iterated without the usual `- 1` on the upper bound.
struct HalfOpenRange{T}
lo::T
hi::T
end
# Pre-1.0 (Julia 0.6/0.7) iteration protocol: the state is the current
# value; iteration terminates when the state reaches `hi` (exclusive).
# NOTE(review): replaced by a single `iterate` method in Julia 1.0.
start(r::HalfOpenRange) = r.lo
done(r::HalfOpenRange, s) = s == r.hi
next(r::HalfOpenRange{T}, s) where {T} = (s, s + T(1))
# Accumulating CSC mat-vec: y .+= A * x, returning `y`.
# Variant under test: iterates each column's nonzeros via HalfOpenRange,
# using `A.colptr[i+1]` directly as an exclusive upper bound (no `- 1`).
# `@inbounds` assumes A's colptr/rowval/nzval are internally consistent
# and that `length(x) == A.n`, `length(y) == A.m` — caller's responsibility.
function new_mul!(y::AbstractVector{Tv}, A::SparseMatrixCSC{Tv,Ti}, x::AbstractVector{Tv}) where {Tv,Ti}
@inbounds for i = 1 : A.n
xval = x[i]
for j = HalfOpenRange(A.colptr[i], A.colptr[i+1])
y[A.rowval[j]] += A.nzval[j] * xval
end
end
y
end
# Accumulating CSC mat-vec: y .+= A * x, returning `y`.
# Reference variant mirroring Base at the time: an inclusive UnitRange
# `A.colptr[i] : A.colptr[i+1] - 1` over each column's nonzeros.
# Identical semantics to `new_mul!`; only the range construction differs,
# which is exactly what the assembly diff below isolates.
function curr_mul!(y::AbstractVector{Tv}, A::SparseMatrixCSC{Tv,Ti}, x::AbstractVector{Tv}) where {Tv,Ti}
@inbounds for i = 1 : A.n
xval = x[i]
for j = A.colptr[i] : A.colptr[i+1] - 1
y[A.rowval[j]] += A.nzval[j] * xval
end
end
y
end
# Benchmark both kernels on an n×n tridiagonal sparse matrix.
# Uses the 0.6-era `spdiagm((vectors...), (offsets...))` tuple API
# (changed to Pair syntax in later Julia versions).
# `setup = (y = zeros($x))` re-zeros `y` before every sample, so each
# timing measures a fresh accumulation rather than compounding results.
function bench(n = 100_000)
A = spdiagm((fill(-1.0, n - 1), fill(2.0, n), fill(-1.2, n - 1)), (-1, 0, 1))
x = rand(n)
bench_new = @benchmark new_mul!(y, $A, $x) setup = (y = zeros($x))
bench_curr = @benchmark curr_mul!(y, $A, $x) setup = (y = zeros($x))
bench_new, bench_curr
end
```

Now the weird thing is that if I diff the result of `@code_native` on Julia 0.7, the slower variant appears to execute only a subset of the assembly operations of the faster variant. Below is the only significant part of the diff:

```
@@ -16,14 +16,9 @@
nopw %cs:(%rax,%rax)
L64:
movq -8(%r11,%r15,8), %rbx
- movq (%r11,%r15,8), %rcx
- addq $-1, %rcx
- leaq -1(%rbx), %rdx
- cmpq %rcx, %rbx
- cmovleq %rcx, %rdx
- addq $1, %rdx
+ movq (%r11,%r15,8), %rdx
cmpq %rdx, %rbx
- je L177
+ je L161
vmovsd -8(%r8,%r15,8), %xmm0 ## xmm0 = mem[0],zero
movq 24(%r10), %rcx
movq 32(%r10), %rdi
```

This is the part where `A.colptr[i]` and `A.colptr[i+1]` are being dereferenced. The fast version (in red, the removed lines) performs the `- 1` adjustment, while the slow version (in green, the added lines) doesn't need to do it and has fewer instructions.

So where could the regression come from? I’m puzzled.

*The benchmark was run on a three-year-old MacBook Air.