Hi,
I work on software that allows calling numerical solvers across different languages; it is implemented using libffi (the Foreign Function Interface library).
Recently I was doing a performance study to see how using a mediator library affects performance compared with using a numerical solver from the same language in which it is written (i.e., calling, say, a Julia solver from a Julia script).
I got somewhat strange results: calling OrdinaryDiffEq.jl with the integrator DP5() directly from Julia is *slower* than calling it from Python via this software.
Precisely, I solve the inviscid Burgers' partial differential equation using the method of lines, where the time integration is done via OrdinaryDiffEq.jl. I solve it multiple times to average the runtime, and I report the mean value with a 95% confidence interval.
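For reference, the spatial discretization uses the local Lax-Friedrichs (Rusanov) flux with the global maximum wave speed $c = \max_j |u_j|$ (this is what the code below computes):

$$
\hat f_{i+1/2} = \frac{1}{2}\bigl(f(u_i) + f(u_{i+1})\bigr) - \frac{c}{2}\bigl(u_{i+1} - u_i\bigr),
\qquad
\frac{\mathrm{d}u_i}{\mathrm{d}t} = -\frac{\hat f_{i+1/2} - \hat f_{i-1/2}}{\Delta x},
$$

with $f(u) = u^2 / 2$ and periodic boundary conditions.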
Here is the solution directly in Julia:
using OrdinaryDiffEq
using Plots
using Printf
using Statistics
function compute_rhs(udot, u, p, t)
    dx, = p
    f = 0.5 * u.^2
    c = maximum(abs.(u))  # Global maximum wave speed.
    # Local Lax-Friedrichs (Rusanov) flux at the interior interfaces.
    f_hat = 0.5 * (f[1:end-1] + f[2:end]) .- 0.5 * c * (u[2:end] - u[1:end-1])
    f_plus = f_hat[2:end]
    f_minus = f_hat[1:end-1]
    udot[2:end-1] = -1.0 / dx * (f_plus - f_minus)
    # Periodic boundary: flux between the last and the first grid point.
    local_ss_rb = max(abs(u[1]), abs(u[end]))
    f_rb = 0.5 * (f[1] + f[end]) - 0.5 * local_ss_rb * (u[1] - u[end])
    f_lb = f_rb
    udot[1] = -1.0 / dx * (f_minus[1] - f_lb)
    udot[end] = -1.0 / dx * (f_rb - f_plus[end])
end
function run()
    N = 4000
    x = collect(range(0, 2, N + 1))
    dx = 2 / N
    u0 = 0.5 .- 0.25 * sin.(pi * x)
    CFL = 0.5
    dt_max = dx * CFL
    t0 = 0.0
    tfinal = 2.0
    odeProblem = ODEProblem(compute_rhs, u0, (t0, tfinal), (dx,))
    odeProblemStep = ODEProblem(compute_rhs, u0, (t0, Inf), (dx,))
    times = collect(range(t0, tfinal, 11))
    local solution_last::Vector{Float64}
    # Step-based variant (the analogue of the Python loop below); kept for reference.
    # for i = 1:5
    #     K = 30
    #     @printf "Trial %d: estimating runtime from %d runs\n" i K
    #     elapsed_times = Float64[]
    #     for k = 1:K
    #         solver = init(odeProblemStep, DP5(), reltol = 1e-6, abstol = 1e-12)
    #         tic = time_ns()
    #         for t in times[2:end]
    #             step!(solver, t - solver.t, true)
    #         end
    #         toc = time_ns()
    #         push!(elapsed_times, (toc - tic) / 1e9)
    #         solution_last = solver.u
    #     end
    #     runtime_mean = mean(elapsed_times)
    #     runtime_std = std(elapsed_times; corrected=true, mean=runtime_mean)
    #     sem = runtime_std / sqrt(length(elapsed_times))
    #     ci = 2 * sem
    #     @printf "Runtime, sec: %.4f ± %.4f\n" runtime_mean ci
    #     @printf "Solution second point from the left value: %.16f\n" solution_last[2]
    # end
    for i = 1:5
        K = 30
        @printf "Trial %d: estimating runtime from %d runs\n" i K
        elapsed_times = Float64[]
        for k = 1:K
            tic = time_ns()
            solution = solve(odeProblem, DP5(), reltol = 1e-6, abstol = 1e-12)
            toc = time_ns()
            push!(elapsed_times, (toc - tic) / 1e9)
            solution_last = solution.u[end]
        end
        runtime_mean = mean(elapsed_times)
        runtime_std = std(elapsed_times; corrected=true, mean=runtime_mean)
        sem = runtime_std / sqrt(length(elapsed_times))
        ci = 2 * sem  # Half-width of the ~95% confidence interval.
        @printf "Runtime, sec: %.4f ± %.4f\n" runtime_mean ci
        @printf "Solution second point from the left value: %.16f\n" solution_last[2]
    end
    p = plot(x, solution_last, linewidth=3)
    display(p)
end
run()
with the following results:
Trial 1: estimating runtime from 30 runs
Runtime, sec: 1.0588 ± 0.0553
Solution second point from the left value: 0.5003054078352265
Trial 2: estimating runtime from 30 runs
Runtime, sec: 1.0851 ± 0.0565
Solution second point from the left value: 0.5003054078352265
Trial 3: estimating runtime from 30 runs
Runtime, sec: 1.1060 ± 0.0591
Solution second point from the left value: 0.5003054078352265
Trial 4: estimating runtime from 30 runs
Runtime, sec: 1.1252 ± 0.0597
Solution second point from the left value: 0.5003054078352265
Trial 5: estimating runtime from 30 runs
Runtime, sec: 1.1070 ± 0.0600
Solution second point from the left value: 0.5003054078352265
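As a sanity check of the Julia-side timing, the same solve could also be measured with BenchmarkTools; a minimal sketch, assuming the problem object odeProblem from run() is in scope:

using BenchmarkTools

# Interpolate odeProblem with $ so that the benchmark does not measure
# global-variable access.
@benchmark solve($odeProblem, DP5(), reltol = 1e-6, abstol = 1e-12)

I have not seen anything there that contradicts the time_ns() numbers above.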
And this is the output from running the same OrdinaryDiffEq.jl integrator DP5() from Python; that is, the ODE system is defined in Python and passed to Julia as a C callback that wraps the Python callback:
import time

import numpy as np

from oif.interfaces.ivp import IVP
from scipy import integrate
class BurgersEquationProblem:
    r"""
    Problem class for inviscid Burgers' equation:

    $$
    u_t + 0.5 * (u^2)_x = 0, \quad x \in [0, 2], \quad t \in [0, 2]
    $$

    with initial condition :math:`u(x, 0) = 0.5 - 0.25 * sin(pi * x)`
    and periodic boundary conditions.

    Parameters
    ----------
    N : int
        Grid resolution.
    """

    def __init__(self, N=101):
        self.N = N
        self.x, self.dx = np.linspace(0, 2, num=N + 1, retstep=True)
        self.u = 0.5 - 0.25 * np.sin(np.pi * self.x)
        self.invariant = np.sum(np.abs(self.u))
        self.cfl = 0.5
        self.dt_max = self.dx * self.cfl
        self.t0 = 0.0
        self.tfinal = 2.0
        self.udot = np.empty_like(self.u)

    def compute_rhs(self, t, u: np.ndarray, udot: np.ndarray, ___) -> None:
        dx = self.dx
        f = 0.5 * u**2
        local_ss = np.max(np.abs(u))  # Global maximum wave speed.
        f_hat = 0.5 * (f[0:-1] + f[1:]) - 0.5 * local_ss * (u[1:] - u[0:-1])
        f_plus = f_hat[1:]
        f_minus = f_hat[0:-1]
        udot[1:-1] = -1.0 / dx * (f_plus - f_minus)
        # Periodic boundary: flux between the last and the first grid point.
        local_ss_rb = np.maximum(np.abs(u[0]), np.abs(u[-1]))
        f_rb = 0.5 * (f[0] + f[-1]) - 0.5 * local_ss_rb * (u[0] - u[-1])
        f_lb = f_rb
        udot[0] = -1.0 / dx * (f_minus[0] - f_lb)
        udot[-1] = -1.0 / dx * (f_rb - f_plus[-1])

    def compute_rhs_scipy(self, t, u):
        self.compute_rhs(t, u, self.udot, None)
        return self.udot
def main():
    problem = BurgersEquationProblem(N=4000)
    s = IVP("jl_diffeq")
    t0 = 0.0
    y0 = problem.u
    s.set_initial_value(y0, t0)
    times = np.linspace(problem.t0, problem.tfinal, num=11)
    for i in range(1, 6):
        K = 30
        print("Trial {:d}: estimating runtime from {:d} runs".format(i, K))
        elapsed_times = []
        solution_last = None
        for k in range(K):
            s.set_initial_value(y0, t0)
            tic = time.perf_counter()
            for t in times[1:]:
                s.integrate(t)
            toc = time.perf_counter()
            elapsed_times.append(toc - tic)
            solution_last = s.y
        mean = np.mean(elapsed_times)
        sem = np.std(elapsed_times, ddof=1) / np.sqrt(len(elapsed_times))
        ci = 2.0 * sem  # Coefficient corresponds to the 95% confidence interval.
        print(f"Runtime, sec: {mean:.4f} ± {ci:.4f}")
        print(f"Solution second point from the left value: {solution_last[1]:.16f}")


if __name__ == "__main__":
    main()
with the following output:
Trial 1: estimating runtime from 30 runs
Runtime, sec: 0.9764 ± 0.0475
Solution second point from the left value: 0.5003054078322573
Trial 2: estimating runtime from 30 runs
Runtime, sec: 0.9422 ± 0.0489
Solution second point from the left value: 0.5003054078322573
Trial 3: estimating runtime from 30 runs
Runtime, sec: 0.9329 ± 0.0620
Solution second point from the left value: 0.5003054078322573
Trial 4: estimating runtime from 30 runs
Runtime, sec: 0.9138 ± 0.0491
Solution second point from the left value: 0.5003054078322573
Trial 5: estimating runtime from 30 runs
Runtime, sec: 0.8905 ± 0.0451
Solution second point from the left value: 0.5003054078322573
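For context on what happens on the Julia side of the bridge: schematically, the RHS passed to OrdinaryDiffEq.jl wraps a C function pointer that in turn wraps the Python callback. This is a simplified sketch, not the actual implementation; fptr and the callback signature are illustrative:

function make_rhs(fptr::Ptr{Cvoid})
    function rhs!(udot, u, p, t)
        # Invoke the C callback that wraps the Python function; assumed signature:
        # void rhs(double t, double* u, double* udot, void* user_data)
        ccall(fptr, Cvoid, (Cdouble, Ptr{Cdouble}, Ptr{Cdouble}, Ptr{Cvoid}),
              t, u, udot, C_NULL)
        return nothing
    end
    return rhs!
end

So every RHS evaluation crosses Julia → C → Python and back, which is exactly the overhead I expected to slow things down, not speed them up.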
Note how running Julia's time integrator from Python, with the overhead of wrapping a Python callback in C and calling it from Julia through a mediator library, somehow outperforms calling Julia's code directly.
The question is: why does this happen? Can anybody point me to something I do not understand in Julia that could help improve its performance? I need this comparison for a scientific paper, so I'd like to be as sure as possible that I am doing things accurately.
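One thing I already suspect is the temporary arrays that compute_rhs allocates on every call (u.^2, abs.(u), and all the slices). If that is the culprit, an in-place variant along these lines should help; a sketch I have not benchmarked yet:

function compute_rhs!(udot, u, p, t)
    dx, = p
    n = length(u)
    c = maximum(abs, u)  # Global maximum wave speed, without a temporary array.
    @inbounds begin
        # Rusanov flux between the last and the first point (periodic boundary).
        c_b = max(abs(u[1]), abs(u[n]))
        f_b = 0.25 * (u[1]^2 + u[n]^2) - 0.5 * c_b * (u[1] - u[n])
        f_prev = 0.25 * (u[1]^2 + u[2]^2) - 0.5 * c * (u[2] - u[1])  # f_hat[1]
        udot[1] = -(f_prev - f_b) / dx
        for i = 2:n-1
            f_cur = 0.25 * (u[i]^2 + u[i+1]^2) - 0.5 * c * (u[i+1] - u[i])
            udot[i] = -(f_cur - f_prev) / dx
            f_prev = f_cur
        end
        udot[n] = -(f_b - f_prev) / dx
    end
    return nothing
end

Though I am not sure this alone would explain the difference, since the NumPy version allocates temporaries too.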
Also, I ran SciPy's dopri5 from Python (which in principle should implement the same method as DP5 in OrdinaryDiffEq.jl), and it is about 30% faster than the Julia code, although it is not even native code but wrapped Fortran from, AFAIK, the 1980s:
Trial 1: estimating runtime from 30 runs
Runtime, sec: 0.7381 ± 0.0124
Solution second point from the left value: 0.5003054078346969
Trial 2: estimating runtime from 30 runs
Runtime, sec: 0.7433 ± 0.0094
Solution second point from the left value: 0.5003054078346969
Trial 3: estimating runtime from 30 runs
Runtime, sec: 0.7466 ± 0.0086
Solution second point from the left value: 0.5003054078346969
Trial 4: estimating runtime from 30 runs
Runtime, sec: 0.7382 ± 0.0111
Solution second point from the left value: 0.5003054078346969
Trial 5: estimating runtime from 30 runs
Runtime, sec: 0.7367 ± 0.0109
Solution second point from the left value: 0.5003054078346969
Note that I report everywhere the solution at the same grid point to make sure that I solve the same problem with the same method (tolerances, etc.). The values agree to about 11 significant digits across all three setups, and within each setup they are bit-for-bit identical across trials.