Using LoopVectorization with StructArrays

Hello,

I am using LoopVectorization combined with StructArrays to vectorize some code that used structures. I have created a simple example of what I am trying to achive using the struct Point2D.

using StructArrays, LoopVectorization, BenchmarkTools

struct Point2D
    x::Float64
    y::Float64
end

struct FirstWay end
struct SecondWay end

@inline function get_new_point(::Type{FirstWay}, p1, p2)
    Point2D(p1.x + p2.x, p1.y + p2.y)
end

@inline function get_new_point(::Type{SecondWay}, p1, p2)
    Point2D(p1.x - p2.x, p1.y - p2.y)
end

# Function working with @turbo but specific to FirstWay
function structarray_turbo_get_all_points(::Type{FirstWay}, p1, p2)
    length(p1) == length(p2) || error()
    x_out = Vector{Float64}(undef, length(p1))
    y_out = Vector{Float64}(undef, length(p1))
    out = StructArray{Point2D}((x_out, y_out))
    @turbo for i in eachindex(p1)
        out.x[i] = p2.x[i] + p1.x[i]
        out.y[i] = p2.y[i] + p1.y[i]
    end
    out
end

# Function working with @turbo but specific to SecondWay
function structarray_turbo_get_all_points(::Type{SecondWay}, p1, p2)
    length(p1) == length(p2) || error()
    x_out = Vector{Float64}(undef, length(p1))
    y_out = Vector{Float64}(undef, length(p1))
    out = StructArray{Point2D}((x_out, y_out))
    @turbo for i in eachindex(p1)
        out.x[i] = p1.x[i] - p2.x[i]
        out.y[i] = p1.y[i] - p2.y[i]
    end
    out
end

# I would like my function to be formated like this for clarity and to avoid repetions
function aim_function_noturbo(T, p1, p2)
    length(p1) == length(p2) || error()
    x_out = Vector{Float64}(undef, length(p1))
    y_out = Vector{Float64}(undef, length(p1))
    out = StructArray{Point2D}((x_out, y_out))
    @inbounds for i in eachindex(p1)
        out[i] = get_new_point(T, p1[i], p2[i])
    end
    out
end

# Function added to compare time without struct arrays
function no_struct(T, p1, p2)
    length(p1) == length(p2) || error()
    out = Vector{Point2D}(undef, length(p1))
    @inbounds for i in eachindex(p1)
        out[i] = get_new_point(T, p1[i], p2[i])
    end
    out
end

p1 = StructArray{Point2D}((rand(100000), rand(100000)))
p2 = StructArray{Point2D}((rand(100000), rand(100000)))
p1_nostruct = Point2D.(rand(100000), rand(100000))
p2_nostruct = Point2D.(rand(100000), rand(100000))

@btime no_struct(FirstWay, p1_nostruct, p2_nostruct) # 305.97 µs 
@btime structarray_turbo_get_all_points(FirstWay, p1, p2) # 130.248 µs
@btime structarray_turbo_get_all_points(SecondWay, p1, p2) # 132.144 µs
@btime aim_function_noturbo(SecondWay, p1, p2) # 406.816

The code aims to compute the get_new_point (which is dependent on an external struct) function for a vector of Points2D using LoopVectorization. Ideally, I would like my vectorised code to be something like:

function aim_function(T, p1, p2)
    length(p1) == length(p2) || error()
    x_out = Vector{Float64}(undef, length(p1))
    y_out = Vector{Float64}(undef, length(p1))
    out = StructArray{Point2D}((x_out, y_out))
    @turbo for i in eachindex(p1)
        out[i] = get_new_point(T, p1[i], p2[i])
    end
    out
end
@btime aim_function_noturbo(SecondWay, p1, p2) # 425 µs

However, @turbo cannot vectorise this code. My question is: If possible, what would be the best way for LoopVectorization to vectorise a function formatted like this?

Thanks

1 Like

There is a typo in structarray_turbo_get_all_points where the loop index should be named i instead of b.

Below are the timings I get when running your code with the following setup

julia> versioninfo()
Julia Version 1.9.0-rc2
Commit 72aec423c2a (2023-04-01 10:41 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 8 × 11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, tigerlake)
  Threads: 1 on 8 virtual cores
Environment:
  LD_PRELOAD = /usr/lib64/libstdc++.so.6
  JULIA_IMAGE_THREADS = 1

julia> Pkg.status()
Status `/tmp/jl_cBg8Ss/Project.toml`
  [bdcacae8] LoopVectorization v0.12.158
  [09ab397b] StructArrays v0.6.15

Note that I am running Julia v1.9.0 which is not released yet.

The timings are

  127.464 μs (2 allocations: 1.53 MiB) # no_struct(FirstWay, p1_nostruct, p2_nostruct)
  120.529 μs (5 allocations: 1.53 MiB) # structarray_turbo_get_all_points(FirstWay, p1, p2)
  129.710 μs (5 allocations: 1.53 MiB) # structarray_turbo_get_all_points(SecondWay, p1, p2)
  140.311 μs (5 allocations: 1.53 MiB) # aim_function_noturbo(SecondWay, p1, p2)
  137.983 μs (5 allocations: 1.53 MiB) # aim_function(SecondWay, p1, p2)

I can see a drastic speed up of no_struct, aim_function_noturbo and aim_function compared to your benchmarks. However, for aim_function I received the warning

 Warning: #= /tmp/mwe.jl:67 =#:
│ `LoopVectorization.check_args` on your inputs failed; running fallback `@inbounds @fastmath` loop instead.
│ Use `warn_check_args=false`, e.g. `@turbo warn_check_args=false ...`, to disable this warning.
└ @ Main ~/.julia/packages/LoopVectorization/IkdFM/src/condense_loopset.jl:1148

which explains why it is as fast as aim_function_noturbo, I guess.

FFIW, I looked at @code_warntype aim_function(SecondWay, p1, p2) and I find that there is a type instability introduced by logger::Union{Nothing, Base.CoreLogging.AbstractLogger}. Not sure if that is related to the above warning and whether eliminating it would speed it up further. Perhaps someone else more experienced could help.

2 Likes

Thanks! I have now similar timings as you. Some time ago I switch the optimisation flag to 1 and forgot to put it back to 3.

I have updated the example but added a vectorisable time consuming function inside get_new_point to have some benefits in using the @turbo

using StructArrays, LoopVectorization, BenchmarkTools

struct Point2D{T}
    x::T
    y::T
end

struct FirstWay end
struct SecondWay end

# Some time consumming function
f(x) = cos(sin(exp(cos(sin(exp(cos(x)^2)^3)^4)^2)^1 / 10)^2)

@inline function get_new_point(::Type{FirstWay}, p1, p2)
    Point2D(f(p1.x + p2.x), f(p1.y + p2.y))
end

@inline function get_new_point(::Type{SecondWay}, p1, p2)
    Point2D(f(p1.x - p2.x), f(p1.y - p2.y))
end

function structarray_turbo_get_all_points!(::Type{FirstWay}, out, p1, p2)
    length(p1) == length(p2) == length(out) || error()
    @turbo for i in eachindex(p1)
        out.x[i] = f(p2.x[i] + p1.x[i])
        out.y[i] = f(p2.y[i] + p1.y[i])
    end
    out
end

function structarray_turbo_get_all_points!(::Type{SecondWay}, out, p1, p2)
    length(p1) == length(p2) == length(out) || error()
    @turbo for i in eachindex(p1)
        out.x[i] = f(p1.x[i] - p2.x[i])
        out.y[i] = f(p1.y[i] - p2.y[i])
    end
    out
end

function aim_function_noturbo!(T, out, p1, p2)
    length(p1) == length(p2) == length(out)|| error()
    @inbounds for i in eachindex(p1)
        out[i] = get_new_point(T, p1[i], p2[i])
    end
    out
end

function no_struct!(T, out, p1, p2)
    length(p1) == length(p2) == length(out) || error()
    @inbounds for i in eachindex(p1)
        out[i] = get_new_point(T, p1[i], p2[i])
    end
    out
end

function aim_function!(T, out, p1, p2)
    length(p1) == length(p2) == length(out) || error()
    @turbo for i in eachindex(p1)
        out[i] = get_new_point(T, p1[i], p2[i])
    end
    out
end
l = 100000
T = Float64
p1 = StructArray{Point2D{T}}((rand(l), rand(l)))
p2 = StructArray{Point2D{T}}((rand(l), rand(l)))
out = StructArray{Point2D{T}}((rand(l), rand(l)))
p1_nostruct = collect(p1)
p2_nostruct = collect(p2)
out_nostruct = collect(out)

@btime a = no_struct!(FirstWay, out_nostruct, p1_nostruct, p2_nostruct); # 12.514 ms
@btime b = structarray_turbo_get_all_points!(FirstWay, copy(out), p1, p2); # 2.132 ms
@btime c = structarray_turbo_get_all_points!(SecondWay, copy(out), p1, p2); # 2.132 ms
@btime d = aim_function_noturbo!(SecondWay, copy(out), p1, p2); # 12.247 ms
@btime e = aim_function!(SecondWay, copy(out), p1, p2); # 12.013 ms
versioninfo()
Julia Version 1.8.3
Commit 0434deb161e (2022-11-14 20:14 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 8 × 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-13.0.1 (ORCJIT, tigerlake)
  Threads: 4 on 8 virtual cores
Environment:
  JULIA_EDITOR = code
  JULIA_NUM_THREADS = 4

Just a side note, you can subtype FieldVector from StaticArrays and get all the algebra on your custom points automatically.