Making @benchmark outputs statistically meaningful, and actionable

So here is some code that you can run (after installing the necessary packages):

MWE
using StaticArrays, Static, BenchmarkTools, Random
import NLLSsolver

# Description of BAL image, and function to transform a landmark from world coordinates to pixel coordinates
struct BALImage{T}
    pose::NLLSsolver.EffPose3D{T}   # Extrinsic parameters (rotation & translation)
    sensor::NLLSsolver.SimpleCamera{T} # Intrinsic sensor parameters (just a single focal length)
    lens::NLLSsolver.BarrelDistortion{T} # Intrinsic lens parameters (k1 & k2 barrel distortion)
end
NLLSsolver.nvars(::BALImage) = static(9) # 9 DoF in total for the image (6 extrinsic, 3 intrinsic)
function NLLSsolver.update(var::BALImage, updatevec, start=1)
    return BALImage(NLLSsolver.update(var.pose, updatevec, start),
                    NLLSsolver.update(var.sensor, updatevec, start+6),
                    NLLSsolver.update(var.lens, updatevec, start+7))
end
function BALImage(rx::T, ry::T, rz::T, tx::T, ty::T, tz::T, f::T, k1::T, k2::T) where T<:Real
    return BALImage{T}(NLLSsolver.EffPose3D(NLLSsolver.Pose3D(rx, ry, rz, tx, ty, tz)), 
                       NLLSsolver.SimpleCamera(f), 
                       NLLSsolver.BarrelDistortion(k1, k2))
end
BALImage(v) = BALImage(v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9])
transform(im::BALImage, X::NLLSsolver.Point3D) = NLLSsolver.ideal2image(im.sensor, NLLSsolver.ideal2distorted(im.lens, -NLLSsolver.project(im.pose * X)))

# Residual that defines the reprojection error of a BAL measurement
struct BALResidual{T} <: NLLSsolver.AbstractResidual
    measurement::SVector{2, T} # Landmark image coordinates (x, y)
    varind::SVector{2, Int} # Variable indices for the residual (image, landmark)
end
BALResidual(m, v) = BALResidual(SVector{2}(m[1], m[2]), SVector{2, Int}(v[1], v[2]))
NLLSsolver.ndeps(::BALResidual) = static(2) # Residual depends on 2 variables
NLLSsolver.nres(::BALResidual) = static(2) # Residual vector has length 2
NLLSsolver.varindices(res::BALResidual) = res.varind
NLLSsolver.getvars(res::BALResidual{T}, vars::Vector) where T = vars[res.varind[1]]::BALImage{T}, vars[res.varind[2]]::NLLSsolver.Point3D{T}
NLLSsolver.computeresidual(res::BALResidual, im::BALImage, X::NLLSsolver.Point3D) = transform(im, X) - res.measurement
const balrobustifier = NLLSsolver.HuberKernel(2.)
NLLSsolver.robustkernel(::BALResidual) = balrobustifier
Base.eltype(::BALResidual{T}) where T = T

function makeBALproblem()
    Random.seed!(1)
    problem = NLLSsolver.NLLSProblem(Union{BALImage{Float64}, NLLSsolver.Point3D{Float64}}, BALResidual{Float64})
    numcameras = 16
    for cam in 1:numcameras
        NLLSsolver.addvariable!(problem, BALImage(0., 0., 0., 0., 0., 0., 1., 0., 0.))
    end
    for lm in 1:22106
        NLLSsolver.addvariable!(problem, NLLSsolver.Point3D(0., 0., 1.))
    end
    for cam in 1:numcameras
        for lm in 1:22106
            if rand() < 0.25
                NLLSsolver.addcost!(problem, BALResidual(SVector(0., 0.), SVector(cam, lm + numcameras)))
            end
        end
    end
    return problem
end

struct MyData
    vars::Vector{Union{BALImage{Float64}, NLLSsolver.Point3D{Float64}}}
    costs::NLLSsolver.VectorRepo{BALResidual{Float64}}
    linsystem::NLLSsolver.MultiVariateLSsparse
end
MyData(problem) = MyData(problem.variables, problem.costs, NLLSsolver.makesymmvls(problem, trues(length(problem.variables)), length(problem.variables)))
MyData() = MyData(makeBALproblem())

function evalfunc(data)
    NLLSsolver.zero!(data.linsystem)
    NLLSsolver.costgradhess!(data.linsystem, data.vars, data.costs)
end

# @benchmark evalfunc(data) setup=(data=MyData()) seconds=15
data = MyData()
@benchmark evalfunc($data)

I put this code in a file, then re-ran all the code in the file several times within the same Julia REPL instance (I’m just hitting the play button in VS Code). Here is the output of the first 5 runs, directly after starting the Julia REPL:

Benchmark results
BenchmarkTools.Trial: 110 samples with 1 evaluation.
 Range (min … max):  44.782 ms …  47.563 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     45.285 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   45.569 ms ± 656.759 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

       ▄ ▂█ ▁      ▃
  ▄▄▇▆██▃████▆▃▃▁▄▃█▃▄▆▄▃▃▃▁▃▃▃▄▆▃▃▃▄▁▄▄▁▄▃▁▁▃▁▁▁▃▁▁▁▁▄▁▁▁▁▃▁▃ ▃
  44.8 ms         Histogram: frequency by time         47.5 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

BenchmarkTools.Trial: 106 samples with 1 evaluation.
 Range (min … max):  46.271 ms …  50.480 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     46.974 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   47.254 ms ± 797.376 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

         ▂ ▅▅██ ▆▂
  ▅▅▄▁▄▇▇███████████▁▅▇▄▁▁▁▄▁▄▅▄▁▁▁▁▄▁▁▁▁▁▄▅▁▁▅▄▅▅▁▅▄▁▁▁▁▁▁▄▁▄ ▄
  46.3 ms         Histogram: frequency by time         49.5 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

BenchmarkTools.Trial: 116 samples with 1 evaluation.
 Range (min … max):  42.548 ms …  45.545 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     43.201 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   43.205 ms ± 316.532 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                   ▃▁   ▁  ▁▁  ▃  █  ▃█▁▆▁▆▆▁▁   ▁▁ ▃ ▃      ▁
  ▄▁▁▄▁▁▄▁▁▁▄▁▁▄▄▁▁██▄▇▇█▇▁██▇▇█▇▄█▇▇█████████▄▇▁██▁█▄█▄▄▁▇▄▇█ ▄
  42.5 ms         Histogram: frequency by time         43.6 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

BenchmarkTools.Trial: 108 samples with 1 evaluation.
 Range (min … max):  45.543 ms …  48.077 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     46.428 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   46.441 ms ± 303.921 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                       ▁ ▃ ▁▆██ ██▃██▃▁▆█ ▃ ▁
  ▄▁▁▁▁▁▁▁▁▄▁▁▁▁▁▇▄▇▁▇▇█▇█▇████▄█████████▄█▇█▄▄▁▄▁▁▁▄▁▄▁▁▄▄▁▁▄ ▄
  45.5 ms         Histogram: frequency by time         47.2 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

BenchmarkTools.Trial: 92 samples with 1 evaluation.
 Range (min … max):  53.781 ms …  58.077 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     54.468 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   54.681 ms ± 842.132 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

      ▂▅ █   ▇ ▂  ▁
  ▃▃▆███▅█▆▆▆█▆████▃▅▁▁▃▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▅▃▃▆▁▃▃▁▁▁▁▁▁▁▁▁▁▁▃ ▁
  53.8 ms         Histogram: frequency by time         57.4 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

Please note that the effect is a function of my CPU (Apple M1 Pro), my OS (macOS), and the current memory layout of my entire system at the time the benchmark was run. There is nothing reliable about this MWE. Sometimes I don’t see any difference. I have zero expectation that you will see the differences I see, or that this MWE will be of any real value as a result.

What I have done is email Prof. Berger and ask if he has any simple code that pathologically demonstrates the problem. It should be possible to write such code. This code will not be normal code. The memory layout will not be a normal layout.

In fact, one MWE is just a single sample of code pattern and memory layout in an infinite space of such patterns. It cannot tell you how susceptible the patterns you use are to this problem. It is of no practical use beyond testing whether a benchmark tool is invariant to this issue.

That’s why I think this isn’t of much use. Anyway, here you go!

@mbauman I also ran the benchmark using the setup block to regenerate the data every evaluation (commented out near the bottom). Here is the first output:

BenchmarkTools.Trial: 123 samples with 1 evaluation.
 Range (min … max):  55.301 ms … 61.543 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     56.650 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   56.956 ms ±  1.053 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

        ▂▂▂ ▄ █▂▆ ▄▄▆             ▄     ▆
  ▆▁▄▄▆████▆█▁███████▄▁█▆▆▄▁█▄██▆██▆▄█▄██▄█▄▄▁▁▁▁▁▄▄▁▁▄▁█▁▁▁▄ ▄
  55.3 ms         Histogram: frequency by time        59.5 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

You’ll note that it doesn’t overlap all the previous histograms, so it doesn’t solve the problem.

There is one small const global variable used, so perhaps changing the address of that is what yields the change.

UPDATE: I realised that some of the data in my MWE changes each run. I made a minor change to fix this, and confirmed that I still see the effect we’re discussing.

4 Likes

I commented out the (optional) global variable (lines 35 & 36), and re-ran the test. I didn’t see much difference between consecutive benchmark results for several attempts, but after one restart of Julia I got the following:

Benchmark results
BenchmarkTools.Trial: 112 samples with 1 evaluation.
 Range (min … max):  43.892 ms …  48.813 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     44.514 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   44.722 ms ± 798.045 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

     ▃  ▂█▂
  ▆▄███▆███▆█▆▄▆▅▅▃▁▃▁▄▁▃▄▁▃▄▁▁▃▁▁▁▁▁▁▁▁▃▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃ ▃
  43.9 ms         Histogram: frequency by time         48.7 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

BenchmarkTools.Trial: 94 samples with 1 evaluation.
 Range (min … max):  52.438 ms …  55.515 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     53.203 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   53.384 ms ± 743.769 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▅▅ ▅█▅    █   ▂▅▂  ▂  ▂▂  ▂         ▅ ▂
  ███████████▅█▁███▅▁█▅▅██▅████▅█▁▅▁▁▅█▁█▅▅▁▁▁▅▁▅▁▅▅▁▅▁▁▁▁█▁▁▅ ▁
  52.4 ms         Histogram: frequency by time         55.3 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

BenchmarkTools.Trial: 113 samples with 1 evaluation.
 Range (min … max):  43.648 ms …  46.271 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     44.223 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   44.304 ms ± 423.759 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

           ▂    ▅▂▂  ▃█
  ▃▁▁▁▃▃▅▇▆█▆█▆▅███▅▅██▅▆▆▆▅▆▃▃▁▃▁▃▁▁▃▁▃▃▁▁▅▃▃▆▁▁▁▅▃▅▃▁▁▁▃▁▁▁▃ ▃
  43.6 ms         Histogram: frequency by time         45.5 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

BenchmarkTools.Trial: 112 samples with 1 evaluation.
 Range (min … max):  43.599 ms … 52.458 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     44.552 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   44.837 ms ±  1.136 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

      ▅▂▇▂█▃
  ▄▅▃▆██████▅▆█▆▃▄▆▃▃▁▁▄▃▃▃▃▆▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃ ▃
  43.6 ms         Histogram: frequency by time        50.2 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

BenchmarkTools.Trial: 109 samples with 1 evaluation.
 Range (min … max):  45.446 ms …  47.930 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     46.087 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   46.221 ms ± 548.686 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

    ▃  ▁█ ▃██▁ █▁ ▃▃  ▃ ▃  ▁    ▃▁
  ▄▇█▇▇██▄████▄██▄██▄▁█▇█▇▄█▄▇▇▇██▄▄▄▇▄▇▁▁▁▄▁▁▄▁▁▄▁▄▇▄▄▁▄▁▁▁▁▄ ▄
  45.4 ms         Histogram: frequency by time         47.7 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

This shows that the global variable is not a necessary factor.

According to the research, things that change memory layout are routine. One example is that UNIX loads the environment into memory before the code and stack, so just tweaking environment variables in some innocuous way (like a different working-directory path because your username is longer than your colleague’s) can have an effect. Many things that are usually out of your control (how things are cached, where functions end up due to link order, the architecture) can tweak performance, and very small changes can just as easily reverse the effects.

Whether that effect is large enough to care about is something else. It’s possible for a very small program to exhibit 33% performance regression on average and 300% in a worst case while incrementing the environment size (Mytkowicz et al. 2009), but other benchmarks with varying environment size or link order had smaller single-digit-percentage impacts. A later review/partial replication (de Oliveira and Petkovich 2014) found yet smaller effects in their benchmarks.
Producing Wrong Data Without Doing Anything Obviously Wrong! (2009)
How Much Does Memory Layout Impact Performance? A Wide Study (2014)

The case here is notably different because the source code isn’t edited between benchmarks, and nothing external, e.g. the environment, is being manually tweaked. The only manual change is running the source file again, which in my understanding does recompile the redefined setup code and the benchmark. From the MWE, the benchmark part mostly forwards to NLLSsolver.costgradhess!, which so far seems like an impure function, but at least the setup of the re-run source file looks like it’s doing the same thing each time (seeded prior to the same number of rand() calls). data = MyData() is likely allocating in different places, and if that allocation alone causes the sometimes significant variation between benchmarks, then I’m guessing that allocating between samples should eliminate the observed effect, much like how Stabilizer randomizes heap locations:
@benchmark evalfunc(data) setup=(data = MyData()). The functions in the source file being redefined between benchmarks could also affect memory layout, so re-running only the benchmark lines should remove that possible effect. Those small experiments only address possible memory layout effects, however; the magnitude of the benchmark variations is also consistent with OS jitter and thermal effects, which are harder to control.
STABILIZER: Statistically Sound Performance Evaluation (2013)
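Spelling that out (an untested sketch reusing the MWE’s MyData and evalfunc; setup runs once per sample, and evals=1 keeps BenchmarkTools from reusing the same object for several evaluations within a sample, so every measured call sees a freshly allocated data object):

# Each sample gets data constructed anew, landing wherever the heap happens to place it.
@benchmark evalfunc(data) setup=(data = MyData()) evals=1

# For comparison: one fixed allocation, reused for every sample.
fixeddata = MyData()
@benchmark evalfunc($fixeddata)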

5 Likes

I would conclude this from the thread:

  • Benchmark timings may fluctuate even when no GC or allocations are involved; the inter-run variability can be much larger than the inter-sample variability.
  • The benchmark output of a single run gives no indication of whether that should be expected, nor can it, because BenchmarkTools has no knowledge of what the machine is doing otherwise.
  • Only interleaved measurements of the codes you want to compare can give a higher degree of certainty that you’re measuring a real difference and not some random machine state (a rough sketch of such interleaving follows this list). I’m not sure if BenchmarkTools currently offers that at all; if not, that would be a good PR.
  • For a single code, only measuring more and more can increase your certainty about how it behaves. There’s probably not much inherent to the BenchmarkTools code that could be improved to decrease machine-dependent variability over a couple hundred samples within a couple of seconds.
  • You can’t just “make [measurements] statistically meaningful”; unless there’s a timing bug, BenchmarkTools just reports what happens. It’s up to the user to interpret that, and the data may be misleading in terms of long-term variability.
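To illustrate the interleaving point, something like this can already be done by hand (a rough, untested sketch; codeA and codeB are placeholders for the two implementations being compared):

using BenchmarkTools, Statistics

# Hypothetical placeholder implementations to compare.
codeA(x) = sum(abs2, x)
codeB(x) = mapreduce(abs2, +, x)

x = rand(1000)
ba = @benchmarkable codeA($x)
bb = @benchmarkable codeB($x)
tune!(ba); tune!(bb)

# Alternate many short runs of A and B, so both see roughly the same machine
# state (background load, frequency scaling, thermals) on average.
tA = Float64[]; tB = Float64[]
for _ in 1:20
    append!(tA, run(ba, seconds=1).times)
    append!(tB, run(bb, seconds=1).times)
end
println("median A: ", median(tA), " ns   median B: ", median(tB), " ns")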
2 Likes

It hasn’t been easy to find an exact definition of a pure function, but from reading the docs, I understand it applies when a function is consistent, effect-free, and globally terminating, as defined in this section.

The function NLLSsolver.costgradhess! is not effect-free, since it mutates its first input, which is heap-allocated. So it is indeed impure. My understanding now is that the ! at the end of a function name conventionally signals this.

What is this assertion based on? If it’s simply based on the amount of variation that OS jitter and thermal effects can cause (reference?), does this mean it explains the fact that two consecutive histograms are completely different? I.e. what is the timescale of these effects? Microseconds, or seconds? Isn’t one of the reasons for doing multiple evaluations within a single benchmark to overcome these effects? Are you suggesting it in fact doesn’t?

If the effect of these variations lasts, as you appear to suggest, for a number of seconds (and also magically transitions exactly between benchmarks), then I would expect to see a much wider range of times for a longer benchmark than for a short one, on a very simple, single-threaded, non-allocating arithmetic test (like my function). Let’s try that:

Code
using StaticArrays, LinearAlgebra, BenchmarkTools

function testfunc(A)
    B = zeros(SMatrix{16, 16, Float64, 256}) + I
    for a = 1:32000
        B = B * A
    end
    return B
end

function myfunc()
    A = randn(SMatrix{16, 16, Float64, 256})
    b = @benchmarkable testfunc($A)
    tune!(b)
    display(run(b, seconds=5))
    display(run(b, seconds=500))
end

myfunc()
Result
BenchmarkTools.Trial: 105 samples with 1 evaluation.
 Range (min … max):  47.796 ms … 48.473 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     47.877 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   47.899 ms ± 95.718 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

      ▁ █  ▄▁
  ▅▃▇▇█▆██▆██▇▄▄▄▄▁▃▁▄▃▅▅▃▁▃▁▁▁▁▃▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃ ▃
  47.8 ms         Histogram: frequency by time        48.4 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.
BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range (min … max):  47.737 ms …  69.168 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     47.854 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   47.925 ms ± 384.009 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

     ▄█▅▂
  ▂▄█████▇▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂ ▃
  47.7 ms         Histogram: frequency by time           49 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

The fact that this result is consistent over a longer period (several minutes - far longer than the period over which I saw my differences) appears to reject the hypothesis that the effects I saw with my code could be caused by OS jitter or thermal effects.

Agree with this.

Disagree with all these. Not sure how you reached any of these conclusions from reading this thread, to be honest.

  1. There is zero evidence that suggests the effects I’m seeing are related to anything else that the machine is doing. I’m claiming it’s due to changes in the memory layout of my Julia process - that is all.
  2. Even interleaved measurements of two codes are not useful. The variability comes from changes in memory layout, and interleaving code doesn’t change memory layout.
  3. More and more samples do not lead to better benchmarking. My results show that. Accounting for different memory layouts does lead to better, more actionable benchmarks. The literature explains that. It doesn’t require more samples, but it does require a more advanced benchmarking tool that jumbles up the memory locations of the data and also the code (a crude sketch of what I mean follows this list).
  4. You can make measurements statistically meaningful. The literature tells us this.
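To make the kind of thing I mean concrete (a crude, untested sketch, assuming the MWE’s MyData and evalfunc are loaded; this only shifts where the data lands, not the compiled code, so it is nothing like the full relocation Stabilizer performs):

# Perturb the heap layout before each full benchmark run by keeping alive a
# randomly sized padding allocation, then rebuilding the problem data.
results = []
for i in 1:10
    pad = rand(UInt8, rand(1:(1 << 22)))   # up to ~4 MiB of padding, kept alive below
    data = MyData()                        # fresh data, shifted by the padding above
    trial = @benchmark evalfunc($data)
    push!(results, (padbytes = length(pad), trial = trial))
end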
1 Like

It’s not a concept exclusive to Julia, and there isn’t just one definition in universal use; there are more like two main usages and a whole lot of nuances, hence why @assume_effects is so complicated. But at the very least, mutation is universally considered impure computation. We probably don’t have to care about this directly, more so the fact that the heap is involved.

OS jitter and overheating affecting performance are very common phenomena, and they can last anywhere up to hours (though you probably don’t deserve a personal computer if you let it nearly melt). Feel free to doubt that they are involved, but I personally have witnessed benchmarks slowing down on occasion for those reasons. Your latest experiment is too short and uncontrolled to address those, and it’s not economical to mistreat your personal machine to really find out. The best thing to do is to make sure you’re not intentionally running any other process (like a video game), to try to turn off automatic processes (like antivirus scans) during benchmark periods, and even then to keep an eye on how hot your machine gets.

1 Like

The results of my first poll are in. Thank you to all 12 voters. :heart:

Now that I’ve provided a MWE that several of you insisted was necessary, I’d love to take another poll. Answers gratefully received. :pray: This will help validate the importance of an MWE in this particular case.

  • I have looked at the code of the MWE
  • I have run the MWE myself
  • The MWE didn’t work for me
  • I saw no discernible change in the benchmark over multiple runs
  • I saw the change in the benchmark that was reported
  • None of the above
  • I believed a MWE was necessary
  • I believe this MWE could help improve BenchmarkTools
0 voters

Anecdotal note: I once put my laptop in a freezer and observed that computations became measurably faster.

3 Likes

Prior to the most recent one that I posted, I ran a different benchmark that was multi-threaded, and it made my machine run very hot. The fan came on, noisily. It’s very easy to tell when my machine is hot. There was a much longer, but very shallow, tail of slower runs, but the mean and median runtimes were very consistent with a shorter run of the same benchmark that didn’t allow the processor to get hot.

Note that the results of the benchmark I did provide are very similar to my actual test case: single-threaded, no memory allocation, pure computation, runs for about 40 ms. In neither this case nor my test case does the fan come on at all.

But this misses the point I was making. You assert that OS jitter and overheating affecting performance could produce the difference I’m seeing, i.e. its magnitude and pattern. What is this based on?

Thanks. Likewise, feel free to believe they are involved. However, I cannot believe that the particular histograms of timings I’m seeing could possibly be caused by the things you’re suggesting.

My latest experiment is far longer than the period over which I see the results I’m raising. So if it’s too short, then why would these factors impact the results I’m reporting?

I’m not running any other process like games or video decoding, I don’t have anti-virus software, and my machine never gets hot, i.e. the fan doesn’t come on.

@Benny I also did not put my laptop in a freezer between runs. Please assume good faith and that I am being sensible in my attempts to fairly benchmark my code.

One more thing. The change in performance doesn’t follow a pattern: faster then slower, or slower then faster; long runs the same, or just one run between changes. If it were the processor getting hot then I’d expect a consistent pattern.

1 Like

I ran your benchmark 20 times here, with:

julia> using StatsPlots 

julia> plt = boxplot()
       for i in 1:20
           data = MyData()
           b = @benchmark evalfunc($data)
           boxplot!(plt, [i], b.times, label=nothing)
           display(plt)
       end
       boxplot!(plt, xticks=(1:20, 1:20))

And got this:

During trials 10 to 14 I got bored and played a short YouTube video.

This is another run, without the YouTube video in the middle:

But at the beginning I was responding here. It seems that, at least for me, huge fluctuations can come from any other use of the computer’s resources (from trial ~15 on I just watched things appear).

7 Likes

Thanks. This isn’t exactly my test case, because in my runs I re-ran all the code in the file. Also, it’s best not to do anything else when running the tests; I wasn’t.

What CPU do you have?

Isn’t the variability just that there are so few samples?

Looking first at a normal run, with 5 seconds (the default), and then a run with 300 seconds, I get

julia> @benchmark evalfunc(data) setup=(data=MyData())
BenchmarkTools.Trial: 21 samples with 1 evaluation.
 Range (min … max):  69.339 ms … 90.937 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     71.689 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   73.860 ms ±  5.615 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

      ▁██
  ▆▆▁▆███▆▆▆▁▁▆▁▁▁▁▁▁▁▁▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆▁▁▁▁▁▁▁▆▁▁▁▁▁▁▁▁▁▁▁▁▁▆ ▁
  69.3 ms         Histogram: frequency by time        90.9 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark evalfunc(data) setup=(data=MyData())
BenchmarkTools.Trial: 22 samples with 1 evaluation.
 Range (min … max):  68.930 ms … 74.168 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     69.903 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   70.086 ms ±  1.153 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▁▁ ██▁▁▁ ▁▁█▁▁█ ▁  █                ▁                     ▁
  ██▁█████▁██████▁█▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
  68.9 ms         Histogram: frequency by time        74.2 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark evalfunc(data) setup=(data=MyData()) seconds=300
BenchmarkTools.Trial: 1278 samples with 1 evaluation.
 Range (min … max):  68.510 ms … 107.098 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     70.759 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   71.938 ms ±   3.600 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

   ▁▅▄█▅▄▁
  ▅██████████▆▇▆▅▅▄▃▃▄▄▄▃▃▃▃▃▃▂▃▃▃▂▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▂▁▁▂▂▁▁▂ ▃
  68.5 ms         Histogram: frequency by time         87.3 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark evalfunc(data) setup=(data=MyData()) seconds=300
BenchmarkTools.Trial: 1296 samples with 1 evaluation.
 Range (min … max):  68.209 ms … 93.074 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     70.227 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   70.922 ms ±  2.515 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

    ▁▃▆▇█▆▅▆▃▄
  ▂▄██████████▇▇▆▅▅▅▅▄▃▃▃▂▃▃▃▃▃▃▂▂▃▂▂▃▂▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▂▂▂▂▁▂▂ ▃
  68.2 ms         Histogram: frequency by time        81.8 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

The distributions for 300 seconds seem similar enough to me, and I imagine that drawing 20 samples from either could generate the data we see for the shorter runs.

I’m not sure if this is an argument for either side here, just another observation.
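One rough way to check that: save the Trial from the long run (e.g. t = @benchmark evalfunc(data) setup=(data=MyData()) seconds=300) and repeatedly draw 21 samples from its recorded times, to see how much the statistics of such small subsets move around (a sketch; t is assumed to hold that saved Trial):

using Statistics

times = t.times                                  # individual sample times, in nanoseconds
subsets = [rand(times, 21) for _ in 1:1000]      # 1000 simulated "short" runs
meds = median.(subsets)
println("spread of 21-sample medians: ", extrema(meds) ./ 1e6, " ms")
println("spread of 21-sample minima:  ", extrema(minimum.(subsets)) ./ 1e6, " ms")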

In addition, I tried a simple function with a loop and some math,

function f(N)
    t = 0
    for i in 1:N
        t += sin(t)+exp(1/(t+1))
    end
    return t
end

which doesn’t allocate anything and, at least on my side, involves no memory management, running with 5 and 300 seconds.

julia> @benchmark f(N) setup=(N=3000000)
BenchmarkTools.Trial: 77 samples with 1 evaluation.
 Range (min … max):  64.624 ms …  67.246 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     64.881 ms               ┊ GC (median):    0.00%
 Time  (mean ± σ):   64.988 ms ± 449.306 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █ █▁ ▁▅▄
  █▆███████▆▁▅▃▃▃▃▃▃▅▁▃▅▃▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃ ▁
  64.6 ms         Histogram: frequency by time         67.2 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark f(N) setup=(N=3000000)
BenchmarkTools.Trial: 75 samples with 1 evaluation.
 Range (min … max):  64.611 ms … 75.665 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     66.164 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   66.887 ms ±  2.346 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▆▄█ ▄▂▂▂ ▂ ▂
  ███▄██████▄█▆▄█▄▄█▄▄▆▄▄▄▆▁▁▁▄▄▄▁▁▁▁▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▁▁▁▄ ▁
  64.6 ms         Histogram: frequency by time        75.6 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark f(N) setup=(N=3000000) seconds=300
BenchmarkTools.Trial: 4525 samples with 1 evaluation.
 Range (min … max):  64.546 ms … 86.066 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     65.825 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   66.288 ms ±  1.736 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  █▂ ▁▂▆█▇▃▂
  ██▆███████▇██▇▆▇▆▅▅▄▄▃▃▃▃▂▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▃
  64.5 ms         Histogram: frequency by time        73.5 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark f(N) setup=(N=300000) seconds=300
BenchmarkTools.Trial: 4577 samples with 1 evaluation.
 Range (min … max):  64.545 ms … 90.790 ms  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     65.101 ms              ┊ GC (median):    0.00%
 Time  (mean ± σ):   65.531 ms ±  1.306 ms  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▅█▇▃▂▁
  ███████▇▆▅▅▅▅▅▅▅▄▄▃▄▃▃▃▃▃▃▃▃▃▃▂▃▃▂▃▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▂ ▃
  64.5 ms         Histogram: frequency by time        70.1 ms <

 Memory estimate: 0 bytes, allocs estimate: 0.

I think here we might actually see a larger discrepancy for the long running ones, but maybe similar enough to your case that it could be the same underlying reason? Though I’m not sure if this could be the problem you argue for or if it could be OS jitter/thermal throttling.

1 Like

On more relevant topics:

  • Have you recorded enough @benchmark results to make a distribution of the various timing statistics? It’d be interesting to see the modes, like how the environment size affected a small benchmark in Mytkowicz et al. In the simplest case with 2 modes you could roughly make a confidence interval for the frequency of one or the other (a rough sketch of this follows the list).
  • Were there also records for benchmarking with setup=(data = MyData()), in effect randomizing where the input is allocated?
  • Has Emery Berger gotten back to you yet about benchmarks or adapting Stabilizer to Julia?
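For the first point, I mean something along these lines (a hypothetical sketch: nruns, the 48 ms cutoff separating the two modes, and the file name are all made up):

using BenchmarkTools, Statistics

# Collect the median of many independent @benchmark runs, re-running the MWE file
# each time as in the original procedure (its trailing @benchmark line commented out),
# then estimate how often the slow mode occurs.
nruns = 30
medians_ms = Float64[]
for _ in 1:nruns
    include("./user.jl")                      # redefines data, as in the MWE
    t = @benchmark evalfunc($data)
    push!(medians_ms, median(t.times) / 1e6)  # times are recorded in nanoseconds
end

p = count(>(48.0), medians_ms) / nruns        # observed frequency of the slow mode
se = sqrt(p * (1 - p) / nruns)                # normal-approximation standard error
println("slow-mode frequency ≈ ", p, " ± ", 1.96 * se, " (rough 95% CI)")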

This is now `include`-ing the code:

julia> plt = boxplot()
       for i in 1:20
           include("./user.jl")
           b = @benchmark evalfunc($data)
           boxplot!(plt, [i], b.times, label=nothing)
           display(plt)
       end
       boxplot!(plt, xticks=(1:20, 1:20))

(The last line of the script, @benchmark ..., is commented out.)

And here is the same but adding a 10s interval between benchmarks:

julia> plt = boxplot()
       for i in 1:20
           include("./user.jl")
           b = @benchmark evalfunc($data)
           boxplot!(plt, [i], b.times, label=nothing)
           display(plt)
           sleep(10)
       end
       boxplot!(plt, xticks=(1:20, 1:20))

My CPU is:

julia> versioninfo()
Julia Version 1.10.0-beta2
Commit a468aa198d0 (2023-08-17 06:27 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 8 × Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, skylake)
  Threads: 1 on 8 virtual cores
Environment:
  JULIA_EDITOR = vim
2 Likes

OK, so Intel, a different architecture. I believe you’re very unlikely to see anything similar to what I’m seeing.

Here’s what I get when just running the code:

And when reloading the code:

I was not doing anything else on my machine during these runs.

1 Like

It may well be that you will not see the differences I’m seeing. What’s your CPU?

Second run with code re-loading:

Again, nothing else going on on my machine.

1 Like