Micro benchmarks of kron for vectors

Edit: corrected according to @jling `s comment

Inspired by another thread here is a micro benchmark of Base.kron for Vector

using LinearAlgebra
using BenchmarkTools

function _kron(A::Matrix, B::Matrix)
    C = zeros(size(A,1) * size(B,1), size(A,2) * size(B,2))
    for i = 1:size(A, 1)
        for j = 1:size(A, 2)
            @views C[(i-1)*size(B,1)+1:i*size(B,1),(j-1)*size(B,2)+1:j*size(B,2)] .= A[i,j] * B
        end
    end
    C
end

function _kron!(C::Matrix, A::Matrix, B::Matrix)
    for i = 1:size(A, 1)
        for j = 1:size(A, 2)
            @views mul!(C[(i-1)*size(B,1)+1:i*size(B,1),(j-1)*size(B,2)+1:j*size(B,2)], A[i,j], B)
        end
    end
    C
end

function _kron(A::Vector, B::Vector)
    C = zeros(size(A,1) * size(B,1))
    for i = 1:size(A, 1)
        @views C[(i-1)*size(B,1)+1:i*size(B,1)] .= A[i] * B
    end
    C
end

function _kron!(C::Vector, A::Vector, B::Vector)
    for i = 1:size(A, 1)
        @views mul!(C[(i-1)*size(B,1)+1:i*size(B,1)], A[i], B)
    end
    C
end

A = Matrix([[1.0, 2.0] [3.0, 4.0]])
B = Matrix([[1.0, 2.0] [3.0, 4.0]])

C1 = _kron(A, B)
C2 = kron(A, B)
@assert C1 ≈ C2

C1 = Matrix{Float64}(undef, 4, 4)
_kron!(C1, A, B)
C2 = Matrix{Float64}(undef, 4, 4)
kron!(C2, A, B)
@assert C1 ≈ C2

@btime _kron($A, $B)
@btime kron($A, $B)

C = Matrix{Float64}(undef, 4, 4)
@btime _kron!($C, $A, $B)
@btime kron!($C, $A, $B)

A = Vector([1.0, 2.0])
B = Vector([1.0, 2.0])

C1 = _kron(A, B)
C2 = kron(A, B)
@assert C1 ≈ C2

C1 = Vector{Float64}(undef, 4)
_kron!(C1, A, B)
C2 = Vector{Float64}(undef, 4)
kron!(C2, A, B)
@assert C1 ≈ C2

@btime _kron($A, $B)
@btime kron($A, $B)

C = Vector{Float64}(undef, 4)
@btime _kron!($C, $A, $B)
@btime kron!($C, $A, $B)

with

  273.597 ns (5 allocations: 656 bytes)
  49.034 ns (1 allocation: 208 bytes)
  67.695 ns (0 allocations: 0 bytes)
  25.201 ns (0 allocations: 0 bytes)
  108.141 ns (3 allocations: 304 bytes)
  128.485 ns (7 allocations: 384 bytes)
  20.783 ns (0 allocations: 0 bytes)
  108.172 ns (6 allocations: 288 bytes)

Is this a defect?

use SVector for your kernel A and B

Vectors are quite big in the original example and please notice the runtime and additional allocations in the vector case in contrast to the matrix case.

isn’t it just in-place vs. not?

No, it tries to compare both (inplace and allocated) of Base.kron to naive versions.

I think there’s no implementation for kron of vectors, it just reshapes to use the one for matrices. And there’s some overhead to that which, for very very small arrays, becomes the dominant time.

julia> size©  # vector case
(4,)

julia> @btime _kron!($C, $A, $B);
  min 11.136 ns, mean 11.307 ns (0 allocations)

julia> @btime kron!($C, $A, $B);
  min 71.422 ns, mean 93.837 ns (6 allocations, 288 bytes. GC mean 21.04%)

julia> @less kron!(C, A, B);  # this reshapes to make matrices

julia> @btime reshape(reshape($C,2,2),4);  # in fact it needs 3 reshapes, not 2
  min 43.939 ns, mean 59.453 ns (4 allocations, 176 bytes. GC mean 22.68%)
2 Likes

Thanks for the explanation!