I am trying to understand how to minimize allocations when using PythonCall. Here, I am using stim, a Python library, to generate samples (stored in NumPy arrays). I have two implementations:
"""
    generate_samples(; circ_string, num_samples=1, batch_seeds_ranges=[1:100, 150:200])

Sample detector and observable data from the stim circuit described by `circ_string`,
running the whole sampling loop inside a single `PythonCall.@pyexec` call.

One detector sampler is compiled per seed in every range of `batch_seeds_ranges`, and
each sampler draws `num_samples` shots. Shots are written bit-packed into two
preallocated `uint8` NumPy arrays, one row per shot, in seed order.

# Keywords
- `circ_string`: circuit description passed to `stim.Circuit`.
- `num_samples`: number of shots drawn per seed.
- `batch_seeds_ranges`: collection of seed ranges; every seed in every range is used.

# Returns
The tuple `(det_shots_np, obs_shots_np, ndets, nobs)` as produced by `@pyexec`: the two
bit-packed sample arrays and the circuit's detector and observable counts.
"""
function generate_samples(; circ_string, num_samples=1, batch_seeds_ranges=[1:100, 150:200])
    # Julia strips the common leading indentation of a triple-quoted string, so the Python
    # code below is seen by the interpreter starting at column 0.
    PythonCall.@pyexec (circ_string, num_samples, batch_seeds_ranges) => """
    import numpy as np
    import stim

    # Set up the circuit and the bit-packed output buffers (8 bits per byte, rounded up).
    circ_stim = stim.Circuit(circ_string)
    nshots = sum(len(seeds) for seeds in batch_seeds_ranges) * num_samples
    ndets = circ_stim.num_detectors
    nobs = circ_stim.num_observables
    det_shots_np = np.zeros(shape=(nshots, (ndets + 7) // 8), dtype=np.uint8)
    obs_shots_np = np.zeros(shape=(nshots, (nobs + 7) // 8), dtype=np.uint8)

    # Each seed fills its own block of num_samples consecutive rows.
    counter = 0
    for seeds in batch_seeds_ranges:
        for seed in seeds:
            sampler = circ_stim.compile_detector_sampler(seed=seed)
            sampler.sample(
                num_samples,
                separate_observables=True,
                bit_packed=True,
                dets_out=det_shots_np[counter:counter + num_samples],
                obs_out=obs_shots_np[counter:counter + num_samples],
            )
            counter += num_samples
    """ => (det_shots_np, obs_shots_np, ndets, nobs)
    return det_shots_np, obs_shots_np, ndets, nobs
end
"""
    generate_samples_wrapper(circ_string, num_samples=1, batch_seeds_ranges=[1:100, 150:200])

Sample detector and observable data from the stim circuit described by `circ_string`,
driving stim and NumPy call-by-call from Julia through PythonCall.

One detector sampler is compiled per seed in every range of `batch_seeds_ranges`, and
each sampler draws `num_samples` shots. Shots are written bit-packed into two
preallocated `uint8` NumPy arrays, one row per shot, in seed order.

# Arguments
- `circ_string`: circuit description passed to `stim.Circuit`.
- `num_samples`: number of shots drawn per seed.
- `batch_seeds_ranges`: collection of seed ranges; every seed in every range is used.

# Returns
The tuple `(det_shots_np, obs_shots_np, ndets, nobs)`: the two bit-packed sample arrays
and the circuit's detector and observable counts, all as Python objects.
"""
function generate_samples_wrapper(
    circ_string, num_samples=1, batch_seeds_ranges=[1:100, 150:200]
)
    circ_stim = stim.Circuit(circ_string)
    ndets = circ_stim.num_detectors
    nobs = circ_stim.num_observables
    nshots = sum(length, batch_seeds_ranges; init=0) * num_samples

    # Bit-packed rows need ceil(n / 8) bytes; `cld` does this in integer arithmetic.
    det_bytes = cld(pyconvert(Int, ndets), 8)
    obs_bytes = cld(pyconvert(Int, nobs), 8)
    det_shots_np = np.zeros(shape=(nshots, det_bytes), dtype=np.uint8)
    obs_shots_np = np.zeros(shape=(nshots, obs_bytes), dtype=np.uint8)

    counter = 0
    for seeds in batch_seeds_ranges
        for seed in seeds
            sampler = circ_stim.compile_detector_sampler(seed=seed)
            # Index with a Python `slice` (0-based, end-exclusive) so NumPy returns a
            # view. A Julia range is converted to a Python `range`, which NumPy treats
            # as advanced indexing: that yields a copy, and the samples written into it
            # would never reach `det_shots_np` / `obs_shots_np`.
            rows = PythonCall.pyslice(counter, counter + num_samples)
            sampler.sample(
                num_samples;
                separate_observables=true,
                bit_packed=true,
                dets_out=det_shots_np[rows],
                obs_out=obs_shots_np[rows],
            )
            # Release the sampler eagerly instead of waiting for Julia's GC.
            PythonCall.Core.pydel!(sampler)
            counter += num_samples
        end
    end
    return det_shots_np, obs_shots_np, ndets, nobs
end
julia> @time det_shots_np, obs_shots_np, _, nobs= generate_samples(;circ_string);
0.130245 seconds (156 allocations: 3.430 KiB)
julia> @time det_shots_np, obs_shots_np, _, nobs= generate_samples_wrapper(circ_string);
0.140203 seconds (8.38 k allocations: 178.477 KiB)
I do not understand why the second implementation has a so much higher allocation count. Any insight would be appreciated.
Using the first implementation would be nice, but unfortunately I run into errors when I try to run it on multiple workers.