Views performance in Julia 1.0-DEV?

The following function runs 1.5x slower in Julia latest 1.0-DEV, compared to Julia 0.6. The memory allocation is the same but @code_llvm shows very mysterious code in 1.0-DEV. Any idea what is going on?

using BenchmarkTools

function gram_schmidt1!(U::Matrix{T}) where T <: Real 
    m = size(U, 2)
    for k = 1:m
        uk = view(U,:,k)
        for j = 1:k-1
            @views uk .-= (U[:,j] ⋅ uk) .* U[:,j]
        end
        uk ./= norm(uk)
    end
end

function ttt()
  N = 100
  A = randn(N, N)
  gram_schmidt1!(A)
  s = sum(A)
end 

Timing in 0.6:

julia> @btime ttt()
  397.717 μs (10002 allocations: 546.95 KiB)
-6.676360719294487

and timing in 1.0-DEV:

julia> @btime ttt()
  587.851 μs (10002 allocations: 546.95 KiB)
-7.642644885020053

I’m not sure what the problem with views are, but as suggested by kristoffer.carlsson, consider using uviews:

julia> using LinearAlgebra #modify gram_schmidt1 to accept AbstractMatrices

julia> @benchmark ttt()
BenchmarkTools.Trial: 
  memory estimate:  546.95 KiB
  allocs estimate:  10002
  --------------
  minimum time:     1.447 ms (0.00% GC)
  median time:      1.503 ms (0.00% GC)
  mean time:        1.709 ms (4.81% GC)
  maximum time:     82.099 ms (97.87% GC)
  --------------
  samples:          2910
  evals/sample:     1

julia> using UnsafeArrays

julia> Base.mightalias(::UnsafeArray, ::UnsafeArray) = false

julia> function tttu()
           N = 100
           A = randn(N, N)
           @uviews A begin
               gram_schmidt1!(A)   
           end
           s = sum(A)
       end 

julia> @benchmark tttu()
BenchmarkTools.Trial: 
  memory estimate:  78.20 KiB
  allocs estimate:  2
  --------------
  minimum time:     623.537 μs (0.00% GC)
  median time:      680.178 μs (0.00% GC)
  mean time:        873.065 μs (3.19% GC)
  maximum time:     123.324 ms (99.07% GC)
  --------------
  samples:          5659
  evals/sample:     1

If you replace the dot and norm functions with ones written in Julia:

@inline function jdot(x,y)
    @boundscheck length(x) == length(y) || throw(BoundsError())
    out = zero(promote_type(eltype(x),eltype(y)))
    @inbounds @simd for i = eachindex(x)
        out += x[i] * y[i]
    end
    out
end
@inline function jnorm(x)
    out = zero(eltype(x))
    @inbounds @simd for i = eachindex(x)
        out += x[i] * x[i]
    end
    sqrt(out)
end
function gram_schmidt2!(U::AbstractMatrix{T}) where T <: Real 
    m = size(U, 2)
    @inbounds for k = 1:m
        uk = view(U,:,k)
        for j = 1:k-1
            @views uk .-= jdot(U[:,j], uk) .* U[:,j]
        end
        uk ./= jnorm(uk)
    end
end
function tttu2()
    N = 100
    A = randn(N, N)
    @uviews A begin
        gram_schmidt2!(A)   
    end
    s = sum(A)
end 

This gives us:

julia> @benchmark tttu2()
BenchmarkTools.Trial: 
  memory estimate:  78.20 KiB
  allocs estimate:  2
  --------------
  minimum time:     556.635 μs (0.00% GC)
  median time:      570.833 μs (0.00% GC)
  mean time:        608.846 μs (2.35% GC)
  maximum time:     77.488 ms (99.14% GC)
  --------------
  samples:          8155
  evals/sample:     1

about 2.5x better than when we started, and better than I get doing the same thing in Julia 0.6.