Help speeding up a test program from a book

I am reading the book Exploring Raspberry Pi: Interfacing to the Real World with Embedded Linux by Derek Molloy. In Chapter 5, Programming on the Raspberry Pi, the author runs a speed test comparing different languages (see the image). My version of his C++ program is listed below, with the original C++ code left in as comments.

Here are my results [running on a Raspberry Pi 3, Raspbian, 32-bit Julia]. They are not bad - better than half of the languages on the chart - but I think it can do better.

julia-user@NODE-RPI3:~/julia-0.6.0/bin $ ./julia n-body.jl
-0.169075164
-0.169083134
 20.172162 seconds (38.14 k allocations: 1.559 MiB)

Here is the program (again - C++ simply commented out).

#/* The Computer Language Benchmarks Game
# * http://benchmarksgame.alioth.debian.org/
# *
# * contributed by Christoph Bauer
# *  
# */

##include <math.h>
##include <stdio.h>
##include <stdlib.h>

##define pi 3.141592653589793
##define solar_mass (4 * pi * pi)
##define days_per_year 365.24

const pi = 3.141592653589793
const solar_mass = (4 * pi * pi)
const days_per_year = 365.24

#struct planet {
#  double x, y, z;
#  double vx, vy, vz;
#  double mass;
#};

mutable struct planet
	x::Float64
	y::Float64
	z::Float64
	vx::Float64
	vy::Float64
	vz::Float64
	mass::Float64
end

#void advance(int nbodies, struct planet * bodies, double dt)
#{
#  int i, j;
#
#  for (i = 0; i < nbodies; i++) {
#    struct planet * b = &(bodies[i]);
#    for (j = i + 1; j < nbodies; j++) {
#      struct planet * b2 = &(bodies[j]);
#      double dx = b->x - b2->x;
#      double dy = b->y - b2->y;
#      double dz = b->z - b2->z;
#      double distance = sqrt(dx * dx + dy * dy + dz * dz);
#      double mag = dt / (distance * distance * distance);
#      b->vx -= dx * b2->mass * mag;
#      b->vy -= dy * b2->mass * mag;
#      b->vz -= dz * b2->mass * mag;
#      b2->vx += dx * b->mass * mag;
#      b2->vy += dy * b->mass * mag;
#      b2->vz += dz * b->mass * mag;
#    }
#  }
#  for (i = 0; i < nbodies; i++) {
#    struct planet * b = &(bodies[i]);
#    b->x += dt * b->vx;
#    b->y += dt * b->vy;
#    b->z += dt * b->vz;
#  }
#}

function advance(nbodies::Int, bodies::Vector{planet}, dt::Float64)
	for i = 1:nbodies
		b::planet = bodies[i]
		for j = i + 1: nbodies
		  b2::planet = bodies[j]
		  dx::Float64 = b.x - b2.x
		  dy::Float64 = b.y - b2.y
		  dz::Float64 = b.z - b2.z
		  distance::Float64 = sqrt(dx * dx + dy * dy + dz * dz)
		  mag::Float64 = dt / (distance * distance * distance)
		  b.vx -= dx * b2.mass * mag
		  b.vy -= dy * b2.mass * mag
		  b.vz -= dz * b2.mass * mag
		  b2.vx += dx * b.mass * mag
		  b2.vy += dy * b.mass * mag
		  b2.vz += dz * b.mass * mag
		end
	end
  for i = 1:nbodies
    b::planet = bodies[i]
    b.x += dt * b.vx
    b.y += dt * b.vy
    b.z += dt * b.vz
  end
end



#double energy(int nbodies, struct planet * bodies)
#{
#  double e;
#  int i, j;
#
#  e = 0.0;
#  for (i = 0; i < nbodies; i++) {
#    struct planet * b = &(bodies[i]);
#    e += 0.5 * b->mass * (b->vx * b->vx + b->vy * b->vy + b->vz * b->vz);
#    for (j = i + 1; j < nbodies; j++) {
#      struct planet * b2 = &(bodies[j]);
#      double dx = b->x - b2->x;
#      double dy = b->y - b2->y;
#      double dz = b->z - b2->z;
#      double distance = sqrt(dx * dx + dy * dy + dz * dz);
#      e -= (b->mass * b2->mass) / distance;
#    }
#  }
#  return e;
#}

function energy(nbodies::Int, bodies::Vector{planet})
	e::Float64 = 0.0
	for i = 1:nbodies
		b::planet = bodies[i]
		e += 0.5 * b.mass * (b.vx * b.vx + b.vy * b.vy + b.vz * b.vz)
		for j = i + 1:nbodies
			b2::planet = bodies[j]
			dx::Float64 = b.x - b2.x
			dy::Float64 = b.y - b2.y
			dz::Float64 = b.z - b2.z
			distance::Float64 = sqrt(dx * dx + dy * dy + dz * dz)
			e -= (b.mass * b2.mass) / distance
	   end
	end
	
	return e
end

#void offset_momentum(int nbodies, struct planet * bodies)
#{
#  double px = 0.0, py = 0.0, pz = 0.0;
#  int i;
#  for (i = 0; i < nbodies; i++) {
#    px += bodies[i].vx * bodies[i].mass;
#    py += bodies[i].vy * bodies[i].mass;
#    pz += bodies[i].vz * bodies[i].mass;
#  }
#  bodies[0].vx = - px / solar_mass;
#  bodies[0].vy = - py / solar_mass;
#  bodies[0].vz = - pz / solar_mass;
#}

function offset_momentum(nbodies::Int, bodies::Vector{planet})
  px::Float64 = 0.0
  py::Float64 = 0.0
  pz::Float64 = 0.0

  for i = 1:nbodies
    px += bodies[i].vx * bodies[i].mass
    py += bodies[i].vy * bodies[i].mass
    pz += bodies[i].vz * bodies[i].mass
  end
  bodies[1].vx = - px / solar_mass
  bodies[1].vy = - py / solar_mass
  bodies[1].vz = - pz / solar_mass
end

##define NBODIES 5
const NBODIES = 5

#struct planet bodies[NBODIES] = {
#  {                               /* sun */
#    0, 0, 0, 0, 0, 0, solar_mass
#  },
#  {                               /* jupiter */
#    4.84143144246472090e+00,
#    -1.16032004402742839e+00,
#    -1.03622044471123109e-01,
#    1.66007664274403694e-03 * days_per_year,
#    7.69901118419740425e-03 * days_per_year,
#    -6.90460016972063023e-05 * days_per_year,
#    9.54791938424326609e-04 * solar_mass
#  },
#  {                               /* saturn */
#    8.34336671824457987e+00,
#    4.12479856412430479e+00,
#    -4.03523417114321381e-01,
#    -2.76742510726862411e-03 * days_per_year,
#    4.99852801234917238e-03 * days_per_year,
#    2.30417297573763929e-05 * days_per_year,
#    2.85885980666130812e-04 * solar_mass
#  },
#  {                               /* uranus */
#    1.28943695621391310e+01,
#    -1.51111514016986312e+01,
#    -2.23307578892655734e-01,
#    2.96460137564761618e-03 * days_per_year,
#    2.37847173959480950e-03 * days_per_year,
#    -2.96589568540237556e-05 * days_per_year,
#    4.36624404335156298e-05 * solar_mass
#  },
#  {                               /* neptune */
#    1.53796971148509165e+01,
#    -2.59193146099879641e+01,
#    1.79258772950371181e-01,
#    2.68067772490389322e-03 * days_per_year,
#    1.62824170038242295e-03 * days_per_year,
#    -9.51592254519715870e-05 * days_per_year,
#    5.15138902046611451e-05 * solar_mass
#  }
#};

bodies = Vector{planet}(NBODIES)
bodies[1] = planet( 	# sun
					0, 0, 0, 0, 0, 0, solar_mass
					)
bodies[2] = planet(		#jupiter
					4.84143144246472090e+00,
					-1.16032004402742839e+00,
					-1.03622044471123109e-01,
					1.66007664274403694e-03 * days_per_year,
					7.69901118419740425e-03 * days_per_year,
					-6.90460016972063023e-05 * days_per_year,
					9.54791938424326609e-04 * solar_mass
					)
bodies[3] = planet(	#saturn
					8.34336671824457987e+00,
					4.12479856412430479e+00,
					-4.03523417114321381e-01,
					-2.76742510726862411e-03 * days_per_year,
					4.99852801234917238e-03 * days_per_year,
					2.30417297573763929e-05 * days_per_year,
					2.85885980666130812e-04 * solar_mass
					)
bodies[4] = planet(	#uranus
					1.28943695621391310e+01,
					-1.51111514016986312e+01,
					-2.23307578892655734e-01,
					2.96460137564761618e-03 * days_per_year,
					2.37847173959480950e-03 * days_per_year,
					-2.96589568540237556e-05 * days_per_year,
					4.36624404335156298e-05 * solar_mass					
					)
bodies[5] = planet(	#neptune
					1.53796971148509165e+01,
					-2.59193146099879641e+01,
					1.79258772950371181e-01,
					2.68067772490389322e-03 * days_per_year,
					1.62824170038242295e-03 * days_per_year,
					-9.51592254519715870e-05 * days_per_year,
					5.15138902046611451e-05 * solar_mass
					)

#int main(int argc, char ** argv)
#{
#  int n = atoi(argv[1]);
#  int i;
#
#  offset_momentum(NBODIES, bodies);
#  printf ("%.9f\n", energy(NBODIES, bodies));
#  for (i = 1; i <= n; i++)
#    advance(NBODIES, bodies, 0.01);
#  printf ("%.9f\n", energy(NBODIES, bodies));
#  return 0;
#}

function main(iterations::Int)
  n::Int = iterations

  offset_momentum(NBODIES, bodies);
  @printf("%.9f\n", energy(NBODIES, bodies))
  for i = 1:n
	advance(NBODIES, bodies, 0.01)
  end
  @printf("%.9f\n", energy(NBODIES, bodies))
end

@time main(5000000)
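
A side note on the measurement: the first call to main also pays Julia's one-time JIT compilation cost, so @time on a single cold call mixes compile time into the number. A warm-up call first gives a cleaner timing (though, since main mutates the global bodies, the printed energies will shift slightly):

main(1)               # warm-up: compiles main/advance/energy, but also advances the system once
@time main(5000000)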

BTW - on my Dell Optiplex 755 with an Intel dual-core E6580 @ 3 GHz (Debian 9.1, kernel 4.9), my results ranked about the same on the chart.

julia-user@NODE-DELL755:~/julia-0.6.0/bin$ ./julia n-body.jl
-0.169075164
-0.169083134
  2.132114 seconds (38.14 k allocations: 1.557 MiB)


This won’t speed up your code, but it will make it more compact. In many cases, you don’t need to declare types. What is important is that the type is stable throughout your code. For example, you can write b2::planet = bodies[j] as b2 = bodies[j]. Last time I checked, there was a performance regression with enumerate(). Once that is fixed, you can use that to iterate over objects and indices without loss in performance, e.g.:

for (i, body) in enumerate(bodies)
    # i is the index, body is bodies[i]
end

I don’t think you need to declare types in your function argument definition either, unless you plan to use multiple dispatch. I didn’t notice any major performance issues, but I am also not the best at optimizing.
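
For illustration, here is roughly what advance looks like with the annotations dropped (a sketch only; since the fields of planet are concretely typed, the compiler infers everything and the behaviour should be identical):

function advance(nbodies, bodies, dt)
    for i = 1:nbodies
        b = bodies[i]
        for j = (i + 1):nbodies
            b2 = bodies[j]
            dx = b.x - b2.x
            dy = b.y - b2.y
            dz = b.z - b2.z
            distance = sqrt(dx * dx + dy * dy + dz * dz)
            mag = dt / (distance * distance * distance)
            b.vx -= dx * b2.mass * mag
            b.vy -= dy * b2.mass * mag
            b.vz -= dz * b2.mass * mag
            b2.vx += dx * b.mass * mag
            b2.vy += dy * b.mass * mag
            b2.vz += dz * b.mass * mag
        end
    end
    for i = 1:nbodies
        b = bodies[i]
        b.x += dt * b.vx
        b.y += dt * b.vy
        b.z += dt * b.vz
    end
end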

OK, thanks. I was basically trying to duplicate the C++ code so it would be a fair comparison. I didn't know whether there were any Julia options on the command line that might be used to speed things up (or some other tricks).
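
Something along the lines of a higher optimization level or disabled bounds checks, e.g. (assuming those flags are available in this Julia version, and that they actually make a difference here):

./julia -O3 --check-bounds=no n-body.jl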

This will probably not impact speed, but I think the code is prettier this way:

const pi = 3.141592653589793
const solar_mass = (4 * pi * pi)
const days_per_year = 365.24

using StaticArrays

const Vec3 = SVector{3, Float64}

mutable struct Planet
    x::Vec3
    v::Vec3
	mass::Float64
end

function advance(bodies::AbstractVector{Planet}, dt::Float64)
    nbodies = length(bodies)
	for i = 1:nbodies
		b = bodies[i]
		for j = (i + 1):nbodies
		    b2 = bodies[j]
            dx = b.x - b2.x
		    distance = norm(dx)
            mag = dt / (distance*distance*distance)
		    b.v  -= dx * b2.mass * mag
            b2.v += dx * b.mass  * mag
		end
	end
    for i = 1:nbodies
        b = bodies[i]
        b.x += dt * b.v
    end
end

function energy(bodies::AbstractVector{Planet})
	e = 0.0
    nbodies = length(bodies)
	for i = 1:nbodies
		b = bodies[i]
		e += 0.5 * b.mass * b.v ⋅ b.v
		for j = (i + 1):nbodies
			b2 = bodies[j]
            dx = b.x - b2.x
			e -= (b.mass * b2.mass) / norm(dx)
        end
	end
	return e
end

function offset_momentum(bodies::AbstractVector{Planet})
    p = zero(Vec3)
    for body in bodies
        p += body.v * body.mass
    end
    bodies[1].v = - p / solar_mass
end

const NBODIES = 5
const bodies = Vector{Planet}(NBODIES)
bodies[1] = Planet( 	# sun
					Vec3([0, 0, 0]),
                    Vec3([0, 0, 0]), 
                    solar_mass
					)
bodies[2] = Planet(		#jupiter
                    Vec3([4.84143144246472090e+00, -1.16032004402742839e+00, -1.03622044471123109e-01]),
                    Vec3([1.66007664274403694e-03, 7.69901118419740425e-03,  -6.90460016972063023e-05] * days_per_year),
					9.54791938424326609e-04 * solar_mass
					)
bodies[3] = Planet(	#saturn
					Vec3([8.34336671824457987e+00,  4.12479856412430479e+00, -4.03523417114321381e-01]),
					Vec3([-2.76742510726862411e-03, 4.99852801234917238e-03, 2.30417297573763929e-05] * days_per_year),
					2.85885980666130812e-04 * solar_mass
					)
bodies[4] = Planet(	#uranus
					Vec3([1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01]),
					Vec3([2.96460137564761618e-03 , 2.37847173959480950e-03 ,-2.96589568540237556e-05] * days_per_year),
					4.36624404335156298e-05 * solar_mass					
					)
bodies[5] = Planet(	#neptune
					Vec3([1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01]),
					Vec3([2.68067772490389322e-03 , 1.62824170038242295e-03 ,-9.51592254519715870e-05] * days_per_year),
					5.15138902046611451e-05 * solar_mass
					)


function main(iterations::Int)
    n = iterations

    offset_momentum(bodies);
    @printf("%.9f\n", energy(bodies))
    for i = 1:n
        advance(bodies, 0.01)
    end
    @printf("%.9f\n", energy(bodies))
end

@time main(5000000)

It sped it up a little:

julia> @time main(5000000)
-0.169180010
-0.169195398
  2.054813 seconds (24 allocations: 320 bytes)

This is a bit advanced, but you could try the https://github.com/simonster/StructsOfArrays.jl package, which allows you to write code as if your data were organized as an array of structs while actually organizing it in memory as a struct of arrays. Better memory locality can significantly improve performance. It's a neat trick that requires pretty hairy code changes in other languages and is just a drop-in replacement in Julia.

You could try peppering some @inbounds and @simd on some of the loops to avoid bounds checks (which I presume the C++ isn't doing) and to give the compiler permission to reorder floating-point operations so it can vectorize more effectively.
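
Applied to the StaticArrays version of advance above, that would look roughly like this (a sketch only - see the next reply for whether it actually helps here):

function advance(bodies::AbstractVector{Planet}, dt::Float64)
    nbodies = length(bodies)
    @inbounds for i = 1:nbodies
        b = bodies[i]
        for j = (i + 1):nbodies
            b2 = bodies[j]
            dx = b.x - b2.x
            distance = norm(dx)
            mag = dt / (distance * distance * distance)
            b.v  -= dx * b2.mass * mag
            b2.v += dx * b.mass  * mag
        end
    end
    @inbounds @simd for i = 1:nbodies
        b = bodies[i]
        b.x += dt * b.v
    end
end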


The optimisations Stefan mentioned normally work well, but here they don't quite work. Since Planet is mutable, StructsOfArrays.jl won't work, and rewriting it so that Planet is immutable in order to use StructsOfArrays doesn't bring a performance increase. @simd and @inbounds on their own don't do anything, since the array access isn't dominating the loop and the loop isn't easily vectorisable.

I recommend using @profile and ProfileView.jl in combination with @btime from BenchmarkTools.jl to make informed refactoring decisions. My computer was noisy and I was getting about 2 s as well on my laptop, but the actual runtime was more like 1 s. BenchmarkTools is your friend :slight_smile:
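
The workflow is roughly the following (a sketch, assuming BenchmarkTools.jl and ProfileView.jl are installed; on Julia 0.7+ you also need an explicit using Profile first):

using BenchmarkTools
@btime advance($bodies, 0.01)   # repeated, noise-resistant timing of the hot function

using ProfileView
Profile.clear()
@profile main(500_000)          # collect samples over a full run
ProfileView.view()              # flame graph of where the time goes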

Profiling tells us that most of the program's time is spent in:

distance = norm(dx)
mag = dt / (distance*distance*distance)

Now if I had time I would be looking at what Clang produces as LLVM IR versus Julia for just that small snippet.
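
On the Julia side the introspection is built in; something along these lines (run after defining the StaticArrays version above):

@code_llvm advance(bodies, 0.01)     # LLVM IR for the concrete argument types
@code_native advance(bodies, 0.01)   # the native code actually emitted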

Looking at that page from the book, I am more interested in the comparison between ARM and 64-bit i7.
In the HPC world there is huge interest in the ARM platform, and there are some real supercomputers running it (for example, Isambard in the UK: GW4 Isambard - GW4).

The Raspberry Pi 3 uses an ARM Cortex-A53 processor with a 512 KB shared L2 cache. This is a 64-bit architecture, but it looks like the OS is 32-bit.
Is there really a 10x penalty for running e.g. Python on the ARM platform? I guess this benchmarking exercise was not aimed at that comparison directly, but rather at using the i7 as a 'gold standard'.

We are probably bringing a knife to a gun fight here, though. The lowest-spec i7 has a 6 MB L3 cache. How much memory does this program use?
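
A quick back-of-envelope check, using the planet struct from the original post (heap object headers and the pointer array add a little on top):

sizeof(planet)        # 7 Float64 fields -> 56 bytes per body
5 * sizeof(planet)    # ~280 bytes of simulation state in total

Either way, the whole working set is a few hundred bytes, so it fits comfortably in L1 on both the Pi and the i7; cache size should not be the differentiator here.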

John, that's why I have been doing a lot of work on bringing GPIO, PWM, etc. functionality to these platforms. I think the Julia folks may be missing a huge opportunity by not having some focus on these Linux-based development boards (so many of which are used for some science purpose).

I am running 32-bit Julia on my development boards (on a Beowulf cluster) because it was the lowest common denominator (I have a Beaglebone Black, Udoo x86, Raspberry Pi 3, and NanoPi Duo). I guess my reason for posting the code is that the 2nd bullet in the Summary of Julia Features is: "Good performance, approaching that of statically-compiled languages like C." If you have to do all kinds of tricks to make your program reach that kind of speed, then I think the bullet is misleading.

I, basically, tried to do a one-for-one translation of the C code to Julia - to make a fair comparison.

Looking at the generated code it looks very similar except:

c++ with clang:

   ...
    vaddsd  %xmm1, %xmm0, %xmm1
    vxorps  %xmm0, %xmm0, %xmm0
    vsqrtsd %xmm1, %xmm0, %xmm0
    vucomisd    %xmm0, %xmm0
    vmulsd  %xmm0, %xmm0, %xmm1
   ...

julia:

   ...
    vaddsd  %xmm5, %xmm4, %xmm4
    vsqrtsd %xmm4, %xmm4, %xmm4
    vmulsd  %xmm4, %xmm4, %xmm5
   ...

It looks like the vxorps %xmm0, %xmm0, %xmm0 is there to avoid a partial register stall (e.g. https://groups.google.com/d/msg/golang-codereviews/ogVDIrYL4cc/z9pE862qBwAJ).

Perhaps this is fixed in a later LLVM build? Perhaps @yuyichao can give more info since he is so good at this stuff…


You are right about that. Yuyichao knows his stuff!!

Presumably you’ve seen this, but in case not:

https://juliaberry.github.io/

I don’t know what “some focus” means, exactly, but please understand that it is amazing that Julia on ARM is simply an apt-get away these days (thanks to Yichao, Viral, Avik, Simon, and many others). When we started working on ARM, the only option was a local build which took several days if you were lucky and there were no errors halfway through requiring a reconfigure and rebuild.


Wow. I apologize - no I didn’t see that. That’s great!

The stuff I've been working on focuses more on sysfs (which is slower), but it is common (and somewhat standard) across Linux devices.

Hi,
I got significant speedups from the following optimizations:

function advance(bodies::AbstractVector{Planet}, dt::Float64)
    nbodies = length(bodies)
	for i = 1:nbodies
		b = bodies[i]
		for j = (i + 1):nbodies
		    b2 = bodies[j]
            dx = b.x - b2.x
            d2 = sum(dx.*dx)        
            dist = sqrt(d2)                     
            mag = dt / (d2*d2)    
            mag *= dist
            
		    b.v  -= dx * (b2.mass * mag)
            b2.v += dx * (b.mass  * mag)
		end
	end
    for i = 1:nbodies
        b = bodies[i]
        b.x += dt * b.v
    end
end


function main(iterations::Int)
    n = iterations

    offset_momentum(bodies);
    #@printf("%.9f\n", energy(bodies))
    for i = 1:n
        advance(bodies, 0.01)
    end
    #@printf("%.9f\n", energy(bodies));
    return nothing
end

I did three things. Firstly, I added parentheses to the update of the velocities. This made no difference in the generated code because LLVM is too smart; however, you want to compute the product mass*mag only once, not three times, and this way the code is obviously correct rather than just surprisingly correct.

Secondly, I removed the @printf in main. I do not know why, but the inliner was extremely unhappy when it was left in.

Thirdly, you will see that I changed the computation of mag. If you look at the generated native code, you will see that the original version emitted two instructions (square root and division) with high latency, which depend on each other. This way, the division and the square root are computed in parallel.
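
Spelled out, the rewrite relies on the identity dt / d^3 = (dt / (d2 * d2)) * sqrt(d2), where d2 = d^2:

# before: the division has to wait for the square root to finish
distance = sqrt(d2)
mag = dt / (distance * distance * distance)   # dt / d^3

# after: dt / (d2 * d2) and sqrt(d2) have no data dependence, so their
# long latencies can overlap; the final product is again dt / d^3
mag  = dt / (d2 * d2)                         # dt / d^4
mag *= sqrt(d2)                               # * d  ->  dt / d^3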

In practice, you might consider doing the division and square root in single precision. I know that this defies the point of this exercise, but Float64 division and square root are really, really slow, so consider whether you can get away with cheaper ops. I am not so sure about Hamiltonian integrators (preserving energy, momentum, angular momentum; not my field), but the errors accumulated by the time discretization are much, much larger than the floating-point errors anyway.


Mixed-precision algorithms are a very hot topic in applied math right now.

https://nickhigham.wordpress.com/2015/10/20/the-rise-of-mixed-precision-arithmetic/

I think we should explore this more in Julia.

It's a common misconception that symplectic integrators "exactly" preserve energy, angular momentum, etc. They don't at all. Instead, they ensure that they integrate on some symplectic manifold, meaning that they have a form of almost-exact periodicity. But since it's not necessarily the right symplectic manifold (it could be off by the order of the error), you will see (small) oscillations in the energy and other properties. Some examples can be found in the SciML tutorials (GitHub: SciML/SciMLTutorials.jl).


What you really want to do is emit the juicy RSQRTSS and RCPSS instructions (on x86) and their vectorized counterparts. In terms of speedup, single precision is chump change compared to these beautiful bastards (they compute an approximate reciprocal square root and an approximate reciprocal of floats for almost the same price as a multiplication, according to http://www.agner.org/optimize/instruction_tables.pdf).

Which brings me to the question: is there a sensible way of accessing these instructions in Julia? Like @really_fast_math? Or as @inline RSQRTSS(x::Float) = something, which hopefully compiles to the single fast instruction on architectures where it exists, and otherwise to 1/sqrt(x)? (We would need to figure out whether LLVM understands this operation and possibly do some version of asm - but Julia/LLVM need to understand this well enough to optimize out boilerplate, select a free register, etc., so a naked x86 emit does not cut it.)

Re symplectic integrators: I kinda remember that the long-term energy non-conservation due to time-discretization is exponentially small in the time-step, because all sorts of things cancel. Floating point errors are different, and hence might blow up in our face. If you say that this is not an issue, then I will believe you; I do not remember the detailed proofs for approximate energy conservation, but I do remember writing into the margins of my textbook that its proof was not directly applicable to floating point errors. This does not preclude better theorems being available in the literature; as I said, symplectic integrators are not my field.
(approximate evaluations of an exact function are not the same as exact evaluations of an approximate function, especially if this function and the approximate function have extra structure like smoothness, analyticity, equivariance under some group, symplecticity, exactness (as a differential form), closedness (exact and no de-rham)).

This question becomes very relevant if we perform the square root with the ~15 bits of precision given by RSQRTSS.

There is @fastmath, which can replace a bunch of operations with faster alternatives like FMA and less precise but faster math functions. I'd check to see whether that does anything here.
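
One way to check what it actually does on a given machine (a sketch; the helper names are made up for illustration):

rsqrt_plain(x::Float32) = 1 / sqrt(x)
rsqrt_fast(x::Float32)  = @fastmath 1 / sqrt(x)

@code_native rsqrt_plain(2.0f0)   # expect a full sqrt followed by a divide
@code_native rsqrt_fast(2.0f0)    # see whether rsqrtss (+ a refinement step) shows up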

Well, there are two different things going on here. There's divergence from the symplectic manifold, which is analytically zero but technically non-zero due to floating-point issues. But then symplecticity and energy conservation are not the same thing. That's shown very clearly in the Kepler example, where a 6th-order symplectic integrator is shown oscillating the energy (by a small amount proportional to the error) while displaying a form of periodicity (but with the slight linear drift mentioned before).

http://nbviewer.jupyter.org/github/JuliaDiffEq/DiffEqTutorials.jl/blob/master/PhysicalModels/KeplerProblem.ipynb

Floating point errors are not even close to the errors one gets from almost any DE discretization unless you’re working in the extreme.


I thought it might be helpful to state this as simply as possible based on some reading I just did, for the rest of us non-experts (I’m sure @ChrisRackauckas will correct me if I’m mistaken):

Symplectic integrators conserve the symplectic two-form dq ∧ dp by construction (subject only to machine error), but typically introduce an error term into the physical Hamiltonian: H → H + O(Δt).


Unfortunately, we don’t get the low-precision version with @fastmath (on my machine @fastmath even produced a slow-down).

In fact, I could not get Julia to emit this at all; LLVM wants to emit a reciprocal square root plus iterative improvement instead of 1/sqrt. Whether this is an improvement over the built-in square-root instruction and one divide is very architecture-dependent. Only if our application is happy with the abysmal precision of the machine reciprocal square root do we get the really big improvements - but, as far as I know, there is no C compiler that optimizes anything to a naked rsqrtss under any compiler options.

I don’t know whether there is any way of getting llvm to emit a naked rsqrtss.
See https://www.bountysource.com/issues/13241137-optimize-reciprocal-square-root-with-fast-math-x86

Floating point errors are not even close to the errors one gets from almost any DE discretization unless you’re working in the extreme.

Exactly. However, I had guessed that ~15 bits of precision for the fast reciprocal sqrt vs analytic zero drift from the symplectic manifold is pretty extreme, especially if you don’t really care about the (typically ill-posed) initial value problem but instead want to look at other properties (hence don’t really care about discretization errors that almost preserve conserved quantities, attractors, ergodic measures on attractors, lyapunov exponents, relevant bifurcations, relevant symmetries, whatever you actually care about).

On the other hand, at least in this example, approximate energy conservation over this timescale still works if we do the square root in Float16. And switching the square root and division to Float32 gives another nice performance boost (still slower than what rsqrtss should give us, but meh).
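
Concretely, the Float32 variant of the inner-loop computation looks something like this (a sketch of the idea rather than the exact code; only the divide and square root are demoted, everything else stays Float64):

d2  = sum(dx .* dx)                                     # Float64, as before
f32 = Float32(d2)
mag = Float64((Float32(dt) / (f32 * f32)) * sqrt(f32))  # cheap Float32 divide + sqrt
b.v  -= dx * (b2.mass * mag)
b2.v += dx * (b.mass  * mag)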

The point where sloppy divisions and square roots should really shine is in systems without extra structure: if your discretization error is of order 10 bits, then everything above 16 bits of precision for your flops is wasted.

Since you obviously know more about integrators than I do and are here already, a slightly off-topic question: why are typical linearly-implicit solvers (like Rosenbrock) not re-using the LU factorization of the Jacobian from previous time steps?

Such an approach should work for problems / phase-space regions where the second derivative is tame. We still need to compute the first derivative by auto-diff in order to check whether we need to do a new LU and to correct errors from using the LU of the wrong Jacobian; but we can save time on computing the LU.

Alternative formulation: are there analogs of Rosenbrock methods for iterative/approximate linear solvers? We have really good starting points for iterations from the previous time steps; it would be a shame to let these go to waste by using a de-novo (Gaussian elimination) solver.

The derivation of Rosenbrock methods assumes, in its order derivation, that the Jacobian is exactly correct. Thus, if it's wrong, you get order loss. ROSW, or Rosenbrock-W, methods don't need this to hold exactly, but they still need a good Jacobian, and it's tied to the error: if these Jacobians are too far off, you will see more error.

Implicit methods only use the Jacobian in the implicit solver, so it's more of a line search, which is why they can re-use Jacobians. The middle ground is (E)SDIRK methods, which are line-by-line implicit while having multiple stages that all use the same I - gamma*J. These methods are thus similar to Rosenbrock methods (in fact, Rosenbrock methods can be thought of as special SDIRK methods where a single Newton step is done at each stage of the method) but are robust to Jacobian reuse, and thus are good when the second derivative is tame. I plan on mentioning this in a review of the methods soon.
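
If you want to play with that trade-off, both families are available in DifferentialEquations.jl; a minimal sketch (modern API assumed, and the solver choices are just representative examples of each class):

using DifferentialEquations

# Van der Pol oscillator - a standard stiff test problem
function vanderpol!(du, u, p, t)
    du[1] = u[2]
    du[2] = p * ((1 - u[1]^2) * u[2] - u[1])
end

prob = ODEProblem(vanderpol!, [2.0, 0.0], (0.0, 6.0), 1e3)

sol_rosenbrock = solve(prob, Rosenbrock23())  # Rosenbrock-type: Jacobian enters the update directly
sol_esdirk     = solve(prob, TRBDF2())        # (E)SDIRK-type: Newton solves that can re-use the Jacobian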