Hey. I want to use octavian matmul in a foor loop. However, the code gets around 10 times slower in the loop.
using BenchmarkTools
using LinearAlgebra
using LoopVectorization
struct cust_Dense
W :: Array{Float32,2}
b :: Array{Float32,1}
end
function cust_Dense(in::Integer, out::Integer)
return cust_Dense(rand(out, in), rand(out))
end
@inline function (c::cust_Dense)(x :: Array{Float32})
x :: Array{Float32} = Octavian.matmul(c.W, x)
#x :: Array{Float32} = *(c.W, x)
@inbounds x = x .+ c.b
return
end
function my_loop(layer,inputs)
for _ in 1:1000
layer(inputs)
end
end
function main()
layer_self = cust_Dense(3,100) |> cpu
layer = Dense(3,100) |> cpu
inputs = rand(3,2000) .|> Float32 |> cpu
@btime my_loop($layer_self,$inputs)
@btime layer_self($inputs)
inputs
end
ins = main()
The Benchmark results:
1.667 s (7063 allocations: 1.49 GiB) for the 1000 loop calls
133.100 μs (7 allocations: 1.53 MiB) for 1 call
If I use the build in matrix multiplication the problem does not appear. But a single call is much slower this way.
Thanks for your time.