Below the output of --track-allocation. The first time when the code is ran with a single thread, the second time when the work is divided and the code is called from 8 threads. Is this output to be trusted? If so where do all the extra allocations come from? In looking at the Windows memory monitor, wild variations in memory consumption can be seen, suggesting there might be something to these profiling outputs…
single-thread
        - function assemblechunk_body!(biop,
        -         test_shapes, test_elements, test_assembly_data,
        -         trial_shapes, trial_elements, trial_assembly_data,
        -         qd, zlocal, store; quadstrat)
        - 
        0     myid = Threads.threadid()
       48     myid == 1 && print("dots out of 10: ")
        0     todo, done, pctg = length(test_elements), 0, 0
        0     for (p,tcell) in enumerate(test_elements)
        0         for (q,bcell) in enumerate(trial_elements)
        - 
        0         fill!(zlocal, 0)
        0         qrule = quadrule(biop, test_shapes, trial_shapes, p, tcell, q, bcell, qd, quadstrat)
 52569600         momintegrals!(biop, test_shapes, trial_shapes, tcell, bcell, zlocal, qrule)
        0         I = length(test_assembly_data[p])
        0         J = length(trial_assembly_data[q])
        0         for j in 1 : J, i in 1 : I
        0             zij = zlocal[i,j]
        0             for (n,b) in trial_assembly_data[q][j]
        0                 zb = zij*b
        0                 for (m,a) in test_assembly_data[p][i]
        0                     store(a*zb, m, n)
        -         end end end end
        - 
        0         done += 1
        0         new_pctg = round(Int, done / todo * 100)
        0         if new_pctg > pctg + 9
      480             myid == 1 && print(".")
        0             pctg = new_pctg
        -     end end
      144     myid == 1 && println("")
        - end
multi-threading
        - function assemblechunk_body!(biop,
        -         test_shapes, test_elements, test_assembly_data,
        -         trial_shapes, trial_elements, trial_assembly_data,
        -         qd, zlocal, store; quadstrat)
        - 
      192     myid = Threads.threadid()
        0     myid == 1 && print("dots out of 10: ")
        0     todo, done, pctg = length(test_elements), 0, 0
      496     for (p,tcell) in enumerate(test_elements)
    43136         for (q,bcell) in enumerate(trial_elements)
        - 
  4491664         fill!(zlocal, 0)
    47040         qrule = quadrule(biop, test_shapes, trial_shapes, p, tcell, q, bcell, qd, quadstrat)
 23514928         momintegrals!(biop, test_shapes, trial_shapes, tcell, bcell, zlocal, qrule)
    23392         I = length(test_assembly_data[p])
    26208         J = length(trial_assembly_data[q])
   678336         for j in 1 : J, i in 1 : I
   105616             zij = zlocal[i,j]
  2133792             for (n,b) in trial_assembly_data[q][j]
   407968                 zb = zij*b
  1984048                 for (m,a) in test_assembly_data[p][i]
  3907536                     store(a*zb, m, n)
        -         end end end end
        - 
        0         done += 1
        0         new_pctg = round(Int, done / todo * 100)
        0         if new_pctg > pctg + 9
        0             myid == 1 && print(".")
      832             pctg = new_pctg
        -     end end
        0     myid == 1 && println("")
        - end
PS: the many allocations on the line with momintegrals! are to be expected. I explicitly rely on dynamic dispatch to choose the appropriate implementation of this function, based on the type of qrule.