I think its better I write the full code here, so that its clear where I need the parallel computing. So I have been trying to write couple functions to do Bootstrap, here are the functions
Function to create the data
function create_data(N1::Int64, N0::Int64)
# I set tau = 5, there was nothing in the paper
Y_1 = Array{Int8}(fill(5, N1));
Y_0 = Array{Float32}(rand(Normal(0,1), N0))
# datasets
# treated data sets
X_t = Array{Float32}(rand(Uniform(0,1), N1))
treated = hcat(X_t, Y_1);
# control data sets
X_c = Array{Float32}(rand(Uniform(0,1), N0))
control = hcat(X_c, Y_0);
treated, control
end
The output of the above function could be placed as arguments in the following which would create a matched data set
function match_tc(treated_d, control_d)
matched_Y0 = Array{Float32}(undef, 0)
for i = 1:size(treated_d)[1]
x = [abs(treated_d[i, 1]-control_d[j, 1]) for j = 1:size(control_d)[1] ];
x_min = findmin(x)[2] #find the 1st minimum, assuming there is only one, 2 is for index
ith_control = control_d[x_min, 2]
push!(matched_Y0, ith_control)
end
matched = hcat(treated_d[:, 2], matched_Y0) # matched dataset
end
Then from the matched data we can do bootstrap using following function
function boot_ATET(B::Int64, treated_d, control_d, N1_b, N0_b)
ATET_b = Float32[] # storage for bootstrap estimates
for b in 1:B
i_t = sample(collect(1: size(treated_d)[1]), N1_b, replace = true)
i_c = sample(collect(1: size(control_d)[1]), N0_b, replace = true)
treated_d_b = treated_d[i_t, :]
control_d_b = control_d[i_c, :]
matched = match_tc(treated_d_b, control_d_b)
ATET = mean(matched[:, 1] - matched[:, 2])
push!(ATET_b, ATET)
end
ATET_b
end
And finally we can do Monte Carlo Simulation using replic
function, and this is where I wanted to use some parallel or distributed computing
function replic(R, N1, N0, B, N1_b, N0_b, seedn)
Random.seed!(seedn)
R_means = Array{Float32}(undef, 0)
R_bootmeans = Array{Float32}(undef, 0)
R_bootvars = Array{Float32}(undef, 0)
for i in 1:R
treated, control = create_data(N1, N0) # create the data
matched = match_tc(treated, control) # match and create matched data
ATET = mean(matched[:, 1] - matched[:, 2]) # estimate
push!(R_means, ATET)
# Bootstrap replication is 100
ATET_b = boot_ATET(B, treated, control, N1_b, N0_b)
boot_mean = mean(ATET_b)
push!(R_bootmeans, boot_mean)
boot_var = sum((ATET_b .- ATET).^2)/B
push!(R_bootvars, boot_var)
end
R_means, R_bootmeans, R_bootvars
end
so after defining you can just call this,
replic(10, 100, 100, 100, 100, 100, 3434);
to get the result for 10 replications, here some parallel would be really useful. This is probably one of the first complete Julia functions I have written, so I could also learn if someone has some suggestions. Thanks in advance