Turned the fastest code in this thread into a small, self-contained package, in case someone needs it.
Original Content of the post
using BenchmarkTools
# Trapezoidal-rule integral of samples `y` over the (possibly non-uniform)
# grid `x`. Interior samples are weighted by (x[i+1]-x[i-1]), the endpoints
# by their single adjacent interval; the sum is halved once at the end.
# Returns zero(fT) when there are fewer than two samples.
@fastmath function trapz(x::T1, y::T2) where {fT,T1<:AbstractVector{fT},T2<:AbstractVector{fT}}
    n = length(x)
    # The @inbounds accesses below are only safe when lengths match.
    length(y) == n || throw(DimensionMismatch("length(x) = $n does not match length(y) = $(length(y))"))
    r = zero(fT)
    # No interval to integrate over with 0 or 1 samples (the original
    # crashed on empty input when reaching x[end]).
    if n <= 1; return r; end
    for i in 2:n-1
        @inbounds r += (x[i+1]-x[i-1]) * y[i]
    end
    @inbounds r += (x[end]-x[end-1]) * y[end] + (x[2] - x[1])*y[1]
    return r/2
end
"""
    multitrapz(V, T)

Integrate array `T` along each dimension `i` using the grid `V[i]`
(trapezoidal rule via `trapz`). A `nothing` entry in `V` skips
integration over that dimension, leaving it intact in the result.
"""
function multitrapz(V, T)
    A = copy(T)
    for (i, x) in enumerate(V)
        # `=== nothing` is the canonical identity test; `== nothing` goes
        # through generic equality, which can be overloaded and type-unstable.
        x === nothing && continue
        A = mapslices(par -> trapz(x, par), A, dims=i)
    end
    return A
end
# Demo / benchmark driver: build a 3-D quadratic field and integrate it,
# first over all three axes, then skipping the y axis via `nothing`.
vx=0:0.01:1
vy=0:0.01:2
vz=0:0.01:3
# M[i,j,k] = vx[i]^2 + vy[j]^2 + vz[k]^2 (a 101×201×301 Array{Float64,3})
M=[x^2+y^2+z^2 for x=vx,y=vy,z=vz]
@btime multitrapz((vx,vy,vz),M)
@btime multitrapz((vx,nothing,vz),M)
I wonder if there is a clever way to avoid performing all these allocations — could someone help me?
195.506 ms (729766 allocations: 64.72 MiB)
195.005 ms (728515 allocations: 64.69 MiB)
The functionality implemented is very convenient, since I can perform the integration over arbitrary dimensions, skipping some dimensions if necessary.
If there is already a package implementing a similar function, please let me know.
Yeah, it seems to be missing some good demos. This doesn’t work due to a bug:
# NOTE(review): quoted suggestion in JuliennedArrays style — `True()`,
# `False()` and `Slices` are not defined anywhere in this file (presumably
# from JuliennedArrays; confirm). The surrounding post says this variant
# does not work due to an upstream bug; kept verbatim for reference.
function multitrapz(V,T::AbstractArray{TT,N}) where {TT,N}
A=copy(T)
# slicing marks which dimension is sliced out (True in position 1 first)
slicing = (True(), Base.tail(ntuple(i->False(), Val(N)))...)
for (i,x) in enumerate(V)
if x != nothing
A=map(par->trapz(x,par), Slices(A, slicing...))
end
slicing = (Base.tail(slicing)..., slicing[1]) # cycle the slicing pattern
end
A
end
To make it truly inferrable you’d be better off using lispy tuple programming rather than writing a loop.
7 allocations per slice, that’s not completely strange. JuliennedArrays, were the bug to be fixed, might knock that down some but not by an enormous amount.
Memory allocations are not evil in and of themselves. I would profile and check performance before worrying about allocation.
Thank you so much, Tim, for your suggestions. Your lispy tuple approach cuts down memory allocations and also reduces execution time!
new code with both versions
using BenchmarkTools
"""
    trapz(x, y)

Trapezoidal-rule integral of the samples `y` over the grid `x` (two
vectors sharing the element type `fT`). Returns `zero(fT)` when only a
single sample is supplied.
"""
function trapz(x::T1, y::T2) where {fT,T1<:AbstractVector{fT},T2<:AbstractVector{fT}}
    npts = length(x)
    acc = zero(fT)
    npts == 1 && return acc
    # Interior samples carry weight (x[k+1] - x[k-1]); the endpoints carry
    # only their single adjacent interval. Halving once at the end saves
    # a division per sample.
    for k in 2:npts-1
        @inbounds acc += (x[k+1] - x[k-1]) * y[k]
    end
    @inbounds acc += (x[end] - x[end-1]) * y[end] + (x[2] - x[1]) * y[1]
    return acc / 2
end
""" First Attempt """
function multitrapz(V,T)
A=copy(T)
for (i,x) in enumerate(V)
if x == nothing; continue; end
A=mapslices(par->trapz(x,par), A, dims=i)
end
A
end
""" Second Attempt """
function Ttrapz(V::Tuple,D::Tuple,T::AbstractArray{TT,N}) where {TT,N}
HV=V[1]
HD=D[1]
restV=Base.tail(V)
restD=Base.tail(D)
if (HV==nothing || HD==nothing); return Ttrapz(restV,restD,T); end
Ttrapz(restV,restD,mapslices(par->trapz(HV,par), T, dims=HD))
end
# Recursion terminator: no grids left to integrate — return the array as-is.
Ttrapz(::Tuple{}, ::Tuple{}, A::AbstractArray) = A
# Convenience entry point: integrate along every dimension 1..length(V),
# pairing V[i] with dimension i.
function Ttrapz(V::Tuple, A::AbstractArray{TT,N}) where {TT,N}
    return Ttrapz(V, ntuple(identity, length(V)), A)
end
# Benchmark setup: the same 101×201×301 quadratic field, integrated with
# both the mapslices version (multitrapz) and the tuple-recursive version
# (Ttrapz), with and without skipping a dimension.
vx=0:0.01:1
vy=0:0.01:2
vz=0:0.01:3
M=[x^2+y^2+z^2 for x=vx,y=vy,z=vz]
arg1=(vx,vy,vz)
arg2=(vx,nothing,vz)  # `nothing` skips integration over the y dimension
# $-interpolation keeps global-variable access out of the timing.
@btime multitrapz($arg1,$M)
@btime multitrapz($arg2,$M)
@btime Ttrapz($arg1,$M)
@btime Ttrapz($arg2,$M)
results:
old code:
190.026 ms (729765 allocations: 64.72 MiB)
190.634 ms (728514 allocations: 64.69 MiB)
new code
145.002 ms (668962 allocations: 17.18 MiB)
145.119 ms (667807 allocations: 17.14 MiB)
Apparently numpy shows stronger performance for nested integral calculation; this has something to do with the fact that the matrix is reallocated every time. Of course, constructing the matrix M is much slower in Python.
import numpy as np

# Same quadratic test field as the Julia version, filled with explicit
# Python loops (deliberately naive — the thread notes this construction
# is much slower in Python than in Julia).
N = 100
P = 200
Q = 300
x = np.linspace(0, 1, N)
y = np.linspace(0, 2, P)
z = np.linspace(0, 3, Q)
M = np.zeros((N, P, Q))
for a, gx in enumerate(x):
    for b, gy in enumerate(y):
        for c, gz in enumerate(z):
            M[a, b, c] = gx * gx + gy * gy + gz * gz
Thank you Tim! I'm hell-bent on beating numpy, so I already thought of making the code natively multidimensional; I will make an attempt soon!
Also thank you for the nice package TiledIteration!
using BenchmarkTools
# A Colon grid means "skip this dimension": return the array unchanged.
@inline trapz_fast(::Colon, args...) = first(args)
"""
    trapz_fast(x, y, dim)

Trapezoidal integration of `y` along dimension `dim` using grid `x`.
Returns a newly allocated array whose size is `size(y)` with dimension
`dim` dropped.
"""
@inline function trapz_fast(x::T1, y::T2, dim::Integer) where {N,fT,T1<:AbstractVector{fT},T2<:AbstractArray{fT,N}}
    n = length(x)
    s = size(y)
    # Validate with real exceptions: @assert is for internal invariants
    # and may be compiled out at higher optimization levels.
    1 <= dim <= N || throw(ArgumentError("dim must be in 1:$N, got $dim"))
    s[dim] == n || throw(DimensionMismatch("size(y, $dim) = $(s[dim]) does not match length(x) = $n"))
    @inbounds begin
        r = zeros(fT, s[1:dim-1]..., s[dim+1:N]...)
        # id(i) indexes the i-th slice along `dim`, colons everywhere else.
        id(i) = ntuple(k -> k == dim ? i : (:), N)
        n == 1 && return r
        for i in 2:n-1
            @fastmath r .+= (x[i+1] - x[i-1]) .* view(y, id(i)...)
        end
        # Fully fused (.+): the original mixed in a bare `+`, which
        # allocated an intermediate array for the two endpoint terms.
        r .+= (x[end] - x[end-1]) .* view(y, id(n)...) .+ (x[2] - x[1]) .* view(y, id(1)...)
        r ./= 2  # halve in place instead of allocating r ./ 2
        return r
    end
end
# Recursive driver: integrate along the head entry of V, then recurse on
# the tail. When the head is a Colon the array passes through unchanged,
# so the running dimension counter D advances by one; when it is a real
# grid, that dimension collapses away and D already points at the next one.
@inline function trapz(V::Tuple, T, D)
    @inbounds begin
        head = V[1]
        reduced = trapz_fast(head, T, D)
        step = ifelse(head == (:), 1, 0)
        return trapz(Base.tail(V), reduced, step + D)
    end
end
# Recursion terminator: no grids left — return the accumulated result.
@inline trapz(::Tuple{}, T, D) = T
# Public entry point: start the tuple recursion at dimension 1.
trapz(V, T) = trapz(V, T, 1)
# Benchmark setup for the natively multidimensional version; lengths
# 100/200/300 match the numpy comparison above.
vx=range(0,1,length=100)
vy=range(0,2,length=200)
vz=range(0,3,length=300)
M=[x^2+y^2+z^2 for x=vx,y=vy,z=vz]
arg1=(vx,vy,vz)
arg2=(vx,:,vz)  # a Colon skips integration over the y dimension
@benchmark trapz($arg1,$M)
BenchmarkTools.Trial:
memory estimate: 2.40 MiB
allocs estimate: 3920
--------------
minimum time: 54.641 ms (0.00% GC)
median time: 56.205 ms (0.00% GC)
mean time: 56.697 ms (0.18% GC)
maximum time: 65.384 ms (0.00% GC)
--------------
samples: 89
evals/sample: 1
@benchmark trapz($arg2,$M)
BenchmarkTools.Trial:
memory estimate: 2.37 MiB
allocs estimate: 3115
--------------
minimum time: 54.772 ms (0.00% GC)
median time: 55.268 ms (0.00% GC)
mean time: 55.734 ms (0.18% GC)
maximum time: 61.830 ms (0.00% GC)
--------------
samples: 90
evals/sample: 1
BenchmarkTools.Trial:
memory estimate: 2.40 MiB
allocs estimate: 3927
--------------
minimum time: 55.564 ms (0.00% GC)
median time: 56.959 ms (0.00% GC)
mean time: 57.564 ms (0.18% GC)
maximum time: 63.437 ms (1.60% GC)
--------------
samples: 87
evals/sample: 1
Reached numpy speed, beat it on generality of use, and with a shorter implementation. Pretty happy now — there is still a type instability that I'm not able to fix, though.
using BenchmarkTools
"""
    bringlast(T, el)

Return tuple `T` with every element equal to `el` moved to the end,
preserving the relative order of the remaining elements.
"""
@inline function bringlast(T::Tuple, el)
    # A ternary — unlike `ifelse`, which eagerly evaluates BOTH branches —
    # recurses into the tail exactly once per level. The original's ifelse
    # computed bringlast(tail) twice per level, i.e. 2^N recursive calls.
    rest = bringlast(Base.tail(T), el)
    return el == T[1] ? (rest..., T[1]) : (T[1], rest...)
end
# Recursion terminator for the empty tuple.
@inline bringlast(T::Tuple{}, el) = T
"""
    trapz(x, y)

Trapezoidal integration of `y` along its LAST dimension using grid `x`.
Returns a new array with that dimension dropped. Integrating the last
dimension first is the fast path this code is optimized for (see the
benchmark notes below).
"""
function trapz(x::T1, y::T2) where {N,fT,T1<:AbstractVector{fT},T2<:AbstractArray{fT,N}}
    n = length(x)
    s = size(y)
    # Throw a real error: @assert is for internal invariants and may be
    # disabled at higher optimization levels.
    s[end] == n || throw(DimensionMismatch("size(y, $N) = $(s[end]) does not match length(x) = $n"))
    @inbounds begin
        # Base.front(s) == s[1:end-1]; clearer than reverse∘tail∘reverse.
        r = zeros(fT, Base.front(s))
        # id(i) selects the i-th slice along the last dimension.
        id(i) = (ntuple(_ -> (:), N - 1)..., i)
        n == 1 && return r
        for i in 2:n-1
            @fastmath r .+= (x[i+1] - x[i-1]) .* view(y, id(i)...)
        end
        # Fully fused (.+): the original's bare `+` allocated a temporary
        # array for the sum of the two endpoint terms.
        r .+= (x[end] - x[end-1]) .* view(y, id(n)...) .+ (x[2] - x[1]) .* view(y, id(1)...)
        r ./= 2  # halve in place instead of allocating r ./ 2
        return r
    end
end
"""
    trapz(x, y, axis)

Integrate `y` along dimension `axis`: lazily permute that dimension to
the end (`PermutedDimsArray` makes no copy) and reuse the last-dimension
method.
"""
function trapz(x::T1, y::T2, axis::T3) where {N,fT,T1<:AbstractVector{fT},T2<:AbstractArray{fT,N},T3<:Integer}
    # Validate with a real exception rather than a strippable @assert.
    1 <= axis <= N || throw(ArgumentError("axis must be in 1:$N, got $axis"))
    return trapz(x, PermutedDimsArray(y, bringlast(ntuple(identity, N), axis)))
end
BenchmarkTools.Trial:
memory estimate: 819.89 KiB
allocs estimate: 635
--------------
minimum time: 6.700 ms (0.00% GC)
median time: 6.819 ms (0.00% GC)
mean time: 6.904 ms (0.63% GC)
maximum time: 9.999 ms (0.00% GC)
--------------
samples: 724
evals/sample: 1
@benchmark trapz($vx,trapz($vy,trapz($vz,$M)))
BenchmarkTools.Trial:
memory estimate: 818.83 KiB
allocs estimate: 614
--------------
minimum time: 6.348 ms (0.00% GC)
median time: 6.501 ms (0.00% GC)
mean time: 6.666 ms (0.67% GC)
maximum time: 9.819 ms (12.43% GC)
--------------
samples: 749
evals/sample: 1
Benchmark, when used inefficiently:
This code is optimized to perform the integral fastest over the last dimension first; here instead we perform the integrals in the opposite order, i.e. first over x, then over y, and last over z.