Octavian matmul is slow inside a for loop

Hey. I want to use Octavian's `matmul` in a for loop. However, each call gets around 10 times slower when it runs inside the loop.

using BenchmarkTools
using LinearAlgebra
using LoopVectorization
"""
    cust_Dense

Hand-rolled dense layer: a `Float32` weight matrix `W` and a `Float32` bias
vector `b`.
"""
struct cust_Dense
    W::Matrix{Float32}
    b::Vector{Float32}
end

"""
    cust_Dense(in::Integer, out::Integer)

Build a `cust_Dense` with an `out × in` weight matrix and a length-`out` bias
vector, both filled with random `Float32` values from `rand`.
"""
function cust_Dense(in::Integer, out::Integer)
    # Draw Float32 directly instead of drawing Float64 arrays and relying on
    # the field conversion to narrow them afterwards.
    return cust_Dense(rand(Float32, out, in), rand(Float32, out))
end
"""
    (c::cust_Dense)(x::Array{Float32})

Apply the layer to `x`: multiply `c.W` by `x` with `Octavian.matmul`, add the
bias `c.b` to the product, and return the result.
"""
@inline function (c::cust_Dense)(x::Array{Float32})
    # NOTE(review): `Octavian` has to be loaded for this call to resolve —
    # confirm the surrounding file does so.
    y = Octavian.matmul(c.W, x)
    # Add the bias into the freshly allocated product rather than allocating a
    # second array for the sum.
    y .+= c.b
    # Hand the output back to the caller; a bare `return` would discard it.
    return y
end

"""
    my_loop(layer, inputs, iterations::Integer=1000)

Call `layer(inputs)` `iterations` times in a row, discarding every result.

# Arguments
- `layer`: any callable accepting `inputs`.
- `inputs`: the value passed to `layer` on each iteration.
- `iterations`: number of calls to make; defaults to `1000`.

# Returns
`nothing`.
"""
function my_loop(layer, inputs, iterations::Integer=1000)
    for _ in 1:iterations
        layer(inputs)
    end
    return nothing
end
"""
    main()

Benchmark a `cust_Dense(3, 100)` layer on a random `3 × 2000` `Float32` input:
once through `my_loop` and once as a single direct call. Return the input
matrix.
"""
function main()
    layer_self = cust_Dense(3, 100) |> cpu
    inputs = rand(Float32, 3, 2000) |> cpu

    @btime my_loop($layer_self, $inputs)
    # `layer_self` is a local, so it is interpolated with `$` here as well,
    # matching the call above; BenchmarkTools expects external variables to be
    # interpolated into the benchmarked expression.
    @btime $layer_self($inputs)

    return inputs
end

# Run the benchmarks and keep the value that `main` returns.
ins = main()

The Benchmark results:
1.667 s (7063 allocations: 1.49 GiB) for the 1000 loop calls
133.100 μs (7 allocations: 1.53 MiB) for 1 call

If I use the built-in matrix multiplication (`*`) instead, the problem does not appear, but a single call is much slower that way.

Thanks for your time.

1 Like