Profiling memory allocations

I have written some code in functional/immutable style, with the aim of not allocating any memory (on the heap). The code is weird because this is a mock-up of something I then want to create a macro to generate from simpler code (working toward a rewriting of EspyInsideFunction.jl. Arguably you do not need to study the code to help me, but for reproducibility here it is:

#using MacroTools
function material(request,z)
    out0 = (;)
    s = sum(z)
    out1 = (out0...,s= haskey(request,:s) ? s : nothing)
    s
    σ = s^2
    χ = 3.
    return σ, χ,out1
end

function residual(request,x,y)
    out0 = (;)
    ngp = 4
    key_gp = haskey(request, :gp) ? request.gp : nothing
    out_gp0 = (;) 
    accum = ntuple(ngp) do igp
        z = x[igp] + y[igp]
        if haskey(key_gp, :material)
            σ, χgp,out_material = material(key_gp.material, z)
            out_gp1             = (out_gp0..., material= out_material)
        else
            σ, χgp              = material(                 z)
            out_gp1             = out_gp0
        end
        σ, χgp
        out_gp2 = haskey(key_gp, :σ       ) ? (out_gp1..., σ       = σ           ) : out_gp1 
        σ
        (σ=σ, χ=χgp,out=out_gp2) 
    end # do igp
    r = sum(        accum[igp].σ for igp=1:ngp)
    χ = NTuple{ngp}(accum[igp].χ for igp=1:ngp)
    out1       = haskey(request   , :gp     ) ? (out0...    , gp      = NTuple{ngp}(accum[igp].out for igp=1:ngp)) : out0
    return r,χ,out1
end   

I did warn you, it’s weird stuff. I then profile memory usage:

using StaticArrays
using Profile,ProfileView,BenchmarkTools,PProf
request = (gp = (material = (s = nothing,), σ = nothing),)

residual(request,SVector(1.,2,3,4),SVector(1.,1,1,1))# don't profile compilation...
Profile.Allocs.clear()
Profile.Allocs.@profile sample_rate=1 for i = 1:999 residual(request,SVector(1.,2,3,4),SVector(1.,1,1,1))
end
PProf.Allocs.pprof(from_c = false)

All tools agree, one allocation per call. But I struggle to interpret where this occurs.

Here is the “source code” view in PProf ( I made small modifications to avoid red typesetting of single or triple quotes).

Alloc: Profile.Allocs.UnknownType
nothing

  Total:         999        999 (flat, cum)   100%
      0          999        999           <instructions with unknown line numbers> 
(::VSCodeServer.var\ #107#109\ {Module, Expr, REPL.LineEditREPL, REPL.LineEdit.Prompt})()
c:\Users\philippem\.vscode\extensions\julialang.language-julia-1.38.2\scripts\packages\VSCodeServer\src\repl.jl

  Total:           0        999 (flat, cum)   100%
    181            .          .                   catch err 
    182            .          .                       @debug "Could not send repl/starteval notification" exception = (err, catch_backtrace()) 
    183            .          .                   end 
    184            .          .                   r = run_with_backend() do 
    185            .          .                       fix_displays(; is_repl = true) 
    186            .        999                       f = () -> repleval(m, line, REPL.repl_filename(repl, main_mode.hist)) 
(::VSCodeServer.var\#106#108\{Module, Expr, REPL.LineEditREPL, REPL.LineEdit.Prompt})()
c:\Users\philippem\.vscode\extensions\julialang.language-julia-1.38.2\scripts\packages\VSCodeServer\src\repl.jl

  Total:           0        999 (flat, cum)   100%
    187            .        999                       PROGRESS_ENABLED[] ? Logging.with_logger(f, VSCodeLogger()) : f() 
    188            .          .                   end 
    189            .          .                   REPL_PROMPT_STATE[] = REPLPromptStates.Error 
    190            .          .                   if r isa EvalError 
    191            .          .                       display_repl_error(stderr, r.err, r.bt) 
    192            .          .                       nothing 
repleval(::Module, ::Expr, ::String)
c:\Users\philippem\.vscode\extensions\julialang.language-julia-1.38.2\scripts\packages\VSCodeServer\src\repl.jl

  Total:           0        999 (flat, cum)   100%
    217            .          .               args = VERSION >= v"1.5" ? (REPL.softscope, m, code, file) : (m, code, file) 
    218            .          .               return include_string(args...) 
    219            .          .           end 
    220            .          .            
    221            .          .           @noinline function repleval(m, code, _) 
    222            .        999               return Base.eval(m, code) 
    223            .          .           end 
    224            .          .            
    225            .          .           # basically the same as Base's `display_error`, with internal frames removed 
    226            .          .           function display_repl_error(io, err, bt) 
    227            .          .               st = stacktrace(crop_backtrace(bt)) 
(::VSCodeServer.var\#61#62\)()
C:\Users\philippem\AppData\Local\Programs\Julia-1.8.2\share\julia\base\task.jl

  Total:           0        999 (flat, cum)   100%
    479            .          .           # generate the code for @async, possibly wrapping the task in something before 
    480            .          .           # pushing it to the wait queue. 
    481            .          .           function do_async_macro(expr; wrap=identity) 
    482            .          .               letargs = Base._lift_one_interp!(expr) 
    483            .          .            
    484            .        999               thunk = esc(:(()->($expr))) 
    485            .          .               var = esc(sync_varname) 
    486            .          .               quote 
    487            .          .                   let $(letargs...) 
    488            .          .                       local task = Task($thunk) 
    489            .          .                       if $(Expr(:islocal, var)) 
invokelatest(::Any)
C:\Users\philippem\AppData\Local\Programs\Julia-1.8.2\share\julia\base\essentials.jl

  Total:           0        999 (flat, cum)   100%
    721            .          .           e.g. long-running event loops or callback functions that may 
    722            .          .           call obsolete versions of a function `f`. 
    723            .          .           (The drawback is that `invokelatest` is somewhat slower than calling 
    724            .          .           `f` directly, and the type of the result cannot be inferred by the compiler.) 
    725            .          .           "" 
    726            .        999           function invokelatest(@nospecialize(f), @nospecialize args...; kwargs...) 
    727            .          .               kwargs = merge(NamedTuple(), kwargs) 
#invokelatest#2
C:\Users\philippem\AppData\Local\Programs\Julia-1.8.2\share\julia\base\essentials.jl

  Total:           0        999 (flat, cum)   100%
    728            .          .               if isempty(kwargs) 
    729            .        999                   return Core._call_latest(f, args...) 
    730            .          .               end 
    731            .          .               return Core._call_latest(Core.kwfunc(f), kwargs, f, args...) 
    732            .          .           end 
eval
C:\Users\philippem\AppData\Local\Programs\Julia-1.8.2\share\julia\base\boot.jl

  Total:           0        999 (flat, cum)   100%
    363            .          .           # This should always be inlined 
    364            .          .           getptls() = ccall(:jl_get_ptls_states, Ptr{Cvoid}, ()) 
    365            .          .            
    366            .          .           include(m::Module, fname::String) = ccall(:jl_load_, Any, (Any, Any), m, fname) 
    367            .          .            
    368            .        999           eval(m::Module, @nospecialize(e)) = ccall(:jl_toplevel_eval_in, Any, (Any, Any), m, e) 
    369            .          .            
    370            .          .           kwfunc(@nospecialize(f)) = ccall(:jl_get_keyword_sorter, Any, (Any,), f) 
    371            .          .            
    372            .          .           kwftype(@nospecialize(t)) = typeof(ccall(:jl_get_kwsorter, Any, (Any,), t)) 
    373            .          .            
macro expansion
c:\Users\philippem\.vscode\extensions\julialang.language-julia-1.38.2\scripts\packages\VSCodeServer\src\eval.jl

  Total:           0        999 (flat, cum)   100%
     29            .          .                       try 
     30            .          .                           f, args = take!(EVAL_CHANNEL_IN) 
     31            .          .                           Base.sigatomic_end() 
     32            .          .                           IS_BACKEND_WORKING[] = true 
     33            .          .                           res = try 
     34            .        999                               Base.invokelatest(f, args...) 
     35            .          .                           catch err 
     36            .          .                               @static if isdefined(Base, :current_exceptions) 
     37            .          .                                   EvalErrorStack(Base.current_exceptions(current_task())) 
     38            .          .                               elseif isdefined(Base, :catch_stack) 
     39            .          .                                   EvalErrorStack(Base.catch_stack()) 
with_logstate(::Function, ::Any)
C:\Users\philippem\AppData\Local\Programs\Julia-1.8.2\share\julia\base\logging.jl

  Total:           0        999 (flat, cum)   100%
    506            .          .               @nospecialize 
    507            .          .               t = current_task() 
    508            .          .               old = t.logstate 
    509            .          .               try 
    510            .          .                   t.logstate = logstate 
    511            .        999                   f() 
    512            .          .               finally 
    513            .          .                   t.logstate = old 
    514            .          .               end 
    515            .          .           end 
    516            .          .            
with_logger
C:\Users\philippem\AppData\Local\Programs\Julia-1.8.2\share\julia\base\logging.jl

  Total:           0        999 (flat, cum)   100%
    618            .          .               test([1,2]) 
    619            .          .           end 
    620            .          .           `` 
    621            .          .           "" 
    622            .          .           function with_logger(@nospecialize(f::Function), logger::AbstractLogger) 
    623            .        999               with_logstate(f, LogState(logger)) 
    624            .          .           end 
    625            .          .            
    626            .          .           "" 
    627            .          .               current_logger() 
    628            .          .            
macro expansion
REPL[15]

  Total:           0        999 (flat, cum)   100%
      1            .        999           ??? 
top-level scope
C:\buildbot\worker\package_win64\build\usr\share\julia\stdlib\v1.8\Profile\src\Allocs.jl

  Total:           0        999 (flat, cum)   100%
     73            .        999           ??? 
eval
C:\Users\philippem\AppData\Local\Programs\Julia-1.8.2\share\julia\base\Base.jl

  Total:           0        999 (flat, cum)   100%
     60            .          .           convert(::Type{Any}, Core.@nospecialize x) = x 
     61            .          .           convert(::Type{T}, x::T) where {T} = x 
     62            .          .           include("coreio.jl") 
     63            .          .            
     64            .          .           eval(x) = Core.eval(Base, x) 
     65            .        999           eval(m::Module, x) = Core.eval(m, x) 
     66            .          .            
     67            .          .           # init core docsystem 
     68            .          .           import Core: @doc, @__doc__, WrappedException, @int128_str, @uint128_str, @big_str, @cmd 
     69            .          .           if isdefined(Core, :Compiler) 
     70            .          .               import Core.Compiler.CoreDocs 

The above never refers to my code (residual, material) but has something about macro expansion. What macro?

The flame graph is more compact. it has no branching, so I can dump it here as a column of strings, verbatim:

root
(::VSCodeServer.var\"#61#62\")()
macro expansion
invokelatest(::Any)
#invokelatest#2
(::VSCodeServer.var\"#63#67\"{VSCodeServer.ReplRunCodeRequestParams})()
with_logger
with_logstate(::Function, ::Any)
(::VSCodeServer.var\"#64#68\"{Bool, Bool, Bool, Module, String, Int64, Int64, String, VSCodeServer.ReplRunCodeRequestParams})()
hideprompt(::VSCodeServer.var\"#65#69\"{Bool, Bool, Bool, Module, String, Int64, Int64, String, VSCodeServer.ReplRunCodeRequestParams})
(::VSCodeServer.var\"#65#69\"{Bool, Bool, Bool, Module, String, Int64, Int64, String, VSCodeServer.ReplRunCodeRequestParams})()
withpath(::VSCodeServer.var\"#66#70\"{Bool, Bool, Bool, Module, String, Int64, Int64, String, VSCodeServer.ReplRunCodeRequestParams}, ::String)
(::VSCodeServer.var\"#66#70\"{Bool, Bool, Bool, Module, String, Int64, Int64, String, VSCodeServer.ReplRunCodeRequestParams})()
inlineeval##kw
var\"#inlineeval#71\"(::Bool, ::typeof(VSCodeServer.inlineeval), ::Module, ::String, ::Int64, ::Int64, ::String)
invokelatest(::Any, ::Any, ::Vararg{Any})
var\"#invokelatest#2\"(::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, ::typeof(Base.invokelatest), ::Any, ::Any, ::Vararg{Any})
include_string(::Module, ::String, ::String)
include_string(::typeof(identity), ::Module, ::String, ::String)
eval
top-level scope
macro expansion
Alloc: Profile.Allocs.UnknownType

Any ideas?

Update.
Somehow, it’s the mere fact of passing the first argument (a nested structure of tuples) that triggers allocation and loss of performance in profiling.

So this is nothing within my functions, and I am begining to suspect an artifact in profiling - triggered in both Profile.profile and BenchmarkTools.@btime.

In fact, I have proof now:

function foo(x,y)
    request = (gp = (material = (s = nothing,), σ = nothing),)
    residual(request,x,y)
end
@btime foo(SVector(1.,2,3,4),SVector(1.,1,1,1))
 1.700 ns (0 allocations: 0 bytes)

which is how I like them.

Oh now, it’s the classic “interpolated your inputs when you benchmark”. I never learn, caught out again!!! :grin: