While choosing a programming language for our next project, I wrote a short program in Julia:
using LinearAlgebra
using Statistics
using Random
import Base: +, -
struct Point
    x::Float64
    y::Float64
    z::Float64
end

+(a::Point, b::Point) = Point(a.x + b.x, a.y + b.y, a.z + b.z)
-(a::Point, b::Point) = Point(a.x - b.x, a.y - b.y, a.z - b.z)

struct Plane
    point::Point
    normal::Array{Float64}
end
ArrayToMatrix(array) = vcat(array...)
function mean_center(points)
    centroid = Point(mean(ArrayToMatrix([[p.x p.y p.z] for p in points]), dims = 1)...)
    points_centered = [p - centroid for p in points]
    centroid, points_centered
end

function best_fit_plane(points)
    centroid, centered_points = mean_center(points)
    m = ArrayToMatrix([[p.x p.y p.z] for p in centered_points])
    u, _, _ = svd(transpose(m))
    Plane(centroid, u[:, end])
end

function main()
    MIN = -2000
    MAX = 2000
    BATCHES_COUNT = 1000000
    POINTS_PER_BATCH = 8
    generatePoint() = map(x -> x + MIN, rand!(zeros(3)) * (MAX - MIN))
    generateBatch() = map(x -> generatePoint(), zeros(POINTS_PER_BATCH))
    generateBatches() = map(x -> generateBatch(), zeros(BATCHES_COUNT))
    for b in generateBatches()
        points = [Point(coord...) for coord in b]
        println(best_fit_plane(points))
    end
end

main()
I ran this code with the following command:
time julia -O 3 -t auto -p auto bench.jl
and got this on my MacBook:
julia -p auto -t auto -O 3 bench.jl 210,16s user 112,23s system 30% cpu 17:43,32 total
Almost 18 minutes. The same code in Python + skspatial finishes in about 11 minutes on average, and a Haskell version in roughly 1 minute.
There are many things to improve here. In addition to the abstract field type in Plane, I believe the main issue is the repeated (and very slow) conversion of Points into a matrix. You should find a way to avoid that.
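On the abstract-type point: Array{Float64} leaves the number of dimensions unspecified, so the normal field is not concretely typed and the compiler cannot specialize code that touches it. A minimal sketch of two concrete alternatives (the struct names are just for illustration, and the second assumes StaticArrays is installed):

# Assumes the Point struct from the original post.
struct PlaneVec
    point::Point
    normal::Vector{Float64}    # concrete: Vector{Float64} is Array{Float64,1}
end

# Or, for fixed-size 3-D data, a stack-allocated static vector:
using StaticArrays
struct PlaneSVec
    point::Point
    normal::SVector{3,Float64}
end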
I’ll have a look later, unless someone beats me to it.
A bit less elegant perhaps, but this already gives you a 2x speed-up:
function mean_center2(points, centroid)
    for p in points
        centroid += [p.x; p.y; p.z]
    end
    centroid = Point(centroid ./ length(points)...)
    points_centered = [p - centroid for p in points]
    centroid, points_centered
end

function best_fit_plane2(points, centroid, m)
    centroid, centered_points = mean_center2(points, centroid)
    @inbounds for p ∈ 1:8
        m[p, 1], m[p, 2], m[p, 3] = centered_points[p].x, centered_points[p].y, centered_points[p].z
    end
    u, _, _ = svd(transpose(m))
    Plane(centroid, u[:, end])
end
function main()
    MIN = -2000
    MAX = 2000
    BATCHES_COUNT = 10000
    POINTS_PER_BATCH = 8
    generatePoint() = map(x -> x + MIN, rand(3) * (MAX - MIN))
    generateBatch() = map(x -> generatePoint(), zeros(POINTS_PER_BATCH))
    generateBatches() = map(x -> generateBatch(), zeros(BATCHES_COUNT))
    for b in generateBatches()
        points = [Point(coord...) for coord in b]
        # println(best_fit_plane(points))
        best_fit_plane(points)
    end
end

function main2()
    MIN = -2000
    MAX = 2000
    BATCHES_COUNT = 10000
    POINTS_PER_BATCH = 8
    m = fill(0.0, POINTS_PER_BATCH, 3)
    centroid = fill(0.0, 3)
    generatePoint() = Point(rand(3) * (MAX - MIN) .+ MIN...)
    generateBatch() = map(x -> generatePoint(), zeros(POINTS_PER_BATCH))
    generateBatches() = map(x -> generateBatch(), zeros(BATCHES_COUNT))
    for b in generateBatches()
        best_fit_plane2(b, centroid, m)
    end
end
@benchmark main()
BenchmarkTools.Trial:
memory estimate: 107.57 MiB
allocs estimate: 1149496
--------------
minimum time: 196.844 ms (9.15% GC)
median time: 216.148 ms (12.00% GC)
mean time: 269.495 ms (23.74% GC)
maximum time: 955.288 ms (44.72% GC)
--------------
samples: 19
evals/sample: 1
@benchmark main2()
BenchmarkTools.Trial:
memory estimate: 89.10 MiB
allocs estimate: 949496
--------------
minimum time: 136.543 ms (3.96% GC)
median time: 138.251 ms (4.29% GC)
mean time: 139.093 ms (4.30% GC)
maximum time: 154.269 ms (3.36% GC)
--------------
samples: 36
evals/sample: 1
It seems printing to a terminal in Julia is pretty slow compared to Python, which becomes noticeable when a program writes a lot of output to the terminal:
paulm@cmstorm 09:25:/data/examples/julia$ cat print_integers.py
for i in range(1, 1000001):
    print(i)
paulm@cmstorm 09:25:/data/examples/julia$ cat print_integers.jl
for i in 1:1000000
    println(i)
end
paulm@cmstorm 09:25:/data/examples/julia$ time python print_integers.py
...
999998
999999
1000000
real 0m2.440s
user 0m1.196s
sys 0m1.093s
paulm@cmstorm 09:25:/data/examples/julia$ time julia -O3 print_integers.jl
...
real 0m5.790s
user 0m2.909s
sys 0m2.863s
paulm@cmstorm 09:26:/data/examples/julia$ time python print_integers.py > /dev/null
real 0m0.277s
user 0m0.274s
sys 0m0.003s
paulm@cmstorm 09:27:/data/examples/julia$ time julia -O3 print_integers.jl > /dev/null
real 0m0.375s
user 0m0.289s
sys 0m0.071s
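If the million println calls really are needed, one way to cut the per-call terminal overhead is to buffer the output and write it out in one go. A minimal sketch of that idea:

# Collect everything in an in-memory buffer, then write it to stdout once,
# instead of paying the I/O cost on every println.
io = IOBuffer()
for i in 1:1_000_000
    println(io, i)
end
write(stdout, take!(io))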
I would not use my own definition of Point. Instead, I would just use StaticArrays. You can roll your own, like you have, but then you should add some more functionality.
You should absolutely avoid the super-expensive conversion from points to a matrix, and I would not generate the whole set of batches at once; just generate one batch at a time.
Here’s an example implementation using SVector:
using StaticArrays
using Statistics: mean
using LinearAlgebra: svd

struct Plane
    point::SVector{3, Float64}
    normal::SVector{3, Float64}
end

points2matrix(points) = reshape(reinterpret(Float64, points), 3, :)

function mean_center(points)
    centroid = mean(points)
    points_centered = points .- (centroid,)
    return centroid, points_centered
end

function best_fit_plane(points)
    centroid, centered_points = mean_center(points)
    M = points2matrix(centered_points)
    u, = svd(M)
    return Plane(centroid, u[:, end])
end
function main()
    MIN = -2000
    MAX = 2000
    BATCHES_COUNT = 1000000
    POINTS_PER_BATCH = 8
    generatePoint() = rand(SVector{3, Float64}) .* (MAX - MIN) .+ MIN
    generateBatch() = [generatePoint() for _ in 1:POINTS_PER_BATCH]
    generateBatches() = (generateBatch() for _ in 1:BATCHES_COUNT)
    planes = [best_fit_plane(batch) for batch in generateBatches()]
    return planes
end
While benchmarking, I suggest reducing BATCHES_COUNT, which is needlessly large for that purpose.
Notice the definition of generateBatches: it has () instead of []. That makes it a generator, so each batch is generated on demand (see the small standalone sketch below).
Runtime for the above code is ~6.5 seconds. It doesn’t print though, so you will need to add that if you like.
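To make the ()/[] difference concrete, here is a small standalone comparison; make_batch is just a hypothetical stand-in for generateBatch:

make_batch() = rand(3, 8)                      # stand-in for generateBatch()

eager = [make_batch() for _ in 1:100_000]      # Vector: all batches allocated up front
lazy  = (make_batch() for _ in 1:100_000)      # Base.Generator: nothing computed yet

# Iterating the generator produces one batch at a time, so each one can be
# garbage-collected as soon as the loop body is done with it.
for b in lazy
    sum(b)   # stand-in for best_fit_plane(b)
end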
Here’s a multithreaded version that scales pretty well (3.5x faster with 4 threads on my laptop):
function main_threaded()
    MIN = -2000
    MAX = 2000
    BATCHES_COUNT = 1000000
    POINTS_PER_BATCH = 8
    generatePoint() = rand(SVector{3, Float64}) .* (MAX - MIN) .+ MIN
    generateBatch() = [generatePoint() for _ in 1:POINTS_PER_BATCH]
    planes = Vector{Plane}(undef, BATCHES_COUNT)
    Threads.@threads for i in 1:BATCHES_COUNT
        planes[i] = best_fit_plane(generateBatch())
    end
    return planes
end
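One caveat worth stating explicitly: Threads.@threads only runs in parallel if Julia was started with more than one thread (e.g. julia -t auto, as in the original command line); otherwise it silently falls back to a single thread.

# Quick sanity check before benchmarking the threaded version:
println(Threads.nthreads())   # should be > 1 when started with `julia -t auto`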
I have a hard time believing that it’s useful to print 1 million calculated Plane objects to the screen. Printing to a file, OK, but I don’t think this is particularly useful, and it messes up benchmarking.
If the objective is to just speed up the posted script, then I think the conversation is going just fine. But if an objective is to compare versions of the script in different languages, then I think the Python and Haskell code should also be posted. I myself won’t be able to make language comparisons, but I’m sure someone else can.
Besides, they may spot performance differences that are happening because code that looks equivalent actually isn’t. A (probably unrelated) example is how array-slicing bracket-syntax makes copies in Julia but views in Python (NumPy).
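To illustrate the copy-vs-view point with a tiny sketch (variable names are arbitrary):

A = rand(4, 4)

s = A[:, 1]         # bracket slicing: s is a copy of the column
v = @view A[:, 1]   # view: shares memory with A, like a NumPy slice

v[1] = -1.0
@assert A[1, 1] == -1.0   # writing through the view changed A
@assert s[1] != -1.0      # the copy is unaffected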
A note: In my posted example, roughly 80%-90% of the time is now spent inside svd. If you want more speed-ups, you should consider whether you can do better than the standard svd.
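For anyone who wants to verify where the time goes, a minimal profiling sketch with the built-in profiler (assuming the main() from the SVector version above is defined); the flat report should show svd-related frames dominating:

using Profile

main()                  # run once so compilation is not profiled
Profile.clear()
@profile main()
Profile.print(format = :flat, sortedby = :count)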
using LinearAlgebra, Statistics, Random, StaticArrays

struct Plane
    point::SVector{3,Float64}
    normal::SVector{3,Float64}
end

function mean_center(points::SMatrix)
    centroid = mean(points, dims = 2)
    points_centered = points .- centroid
    centroid, points_centered
end

function best_fit_plane(points::SMatrix)
    centroid, centered_points = mean_center(points)
    u, = svd(centered_points)  # only the first output (U) is needed, so the rest is dropped
    Plane(centroid, u[:, end])
end

function main()
    MIN = -2000
    MAX = 2000
    BATCHES_COUNT = 10000
    POINTS_PER_BATCH = 8
    generateBatch() = MIN .+ (MAX - MIN) * rand(SMatrix{3,POINTS_PER_BATCH,Float64})
    out = Vector{Plane}(undef, BATCHES_COUNT)  # initialize output
    for i in eachindex(out)
        b = generateBatch()
        out[i] = best_fit_plane(b)
    end
    return out
end
I tried speeding up the code by storing the batch data in an SMatrix from StaticArrays, but it seems that it uses the same SVD as Julia Base, so there is no significant speed-up. The generation of the batch data is slightly faster, though.
@Yury_Kotlyarov if you are working with computational geometry, consider Meshes.jl. It aims to be a performant API for geometric objects and meshes. Your Point type is available there, for example, as well as many other types.
For data with only 3 dimensions I think it will be hard to make it faster; otherwise a truncated SVD or RandomizedLinAlg.jl might be worth a try. Using MKL and Float32 instead of Float64 can speed up svd quite a bit.
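A minimal sketch of the Float32 idea, assuming the SMatrix batch layout from the post above; actual gains depend on the BLAS/LAPACK backend:

using LinearAlgebra, StaticArrays

M64 = rand(SMatrix{3,8,Float64}) .* 4000 .- 2000   # a batch like generateBatch() produces
M32 = Float32.(M64)                                # convert once, before the SVD

u64 = svd(M64).U   # double-precision LAPACK path
u32 = svd(M32).U   # single-precision path, typically noticeably faster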
Since we are only interested in one part of the SVD, namely U, we can calculate the SVD of MM' instead, as shown here: if M = UΣVᵀ, then MM' = UΣ²Uᵀ, so the SVDs of MM' and M share the same U.
function best_fit_plane(points::SMatrix)
    centroid, centered_points = mean_center(points)
    m = centered_points * transpose(centered_points)
    u, = svd(m)  # only the first output (U) is needed, so the rest is dropped
    Plane(centroid, u[:, end])
end
The gain is already noticeable, and it should be even higher if we increase POINTS_PER_BATCH.
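A quick numerical sanity check of the MM' trick with plain arrays (when the singular values are distinct, the columns of U can differ only by sign):

using LinearAlgebra

M = randn(3, 8)
U1 = svd(M).U        # left singular vectors of M
U2 = svd(M * M').U   # left singular vectors of M*M'

@assert abs.(U1' * U2) ≈ Matrix(1.0I, 3, 3)   # same directions, up to column signs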
One needs to check that I have made no errors in the implementation or in my train of thought, but with this trick I can bring the time of the main loop down from 15 s to 4 s.