The function case
in my application runs a bash script (using run
) and takes as input the gpu id to run the function on. Now I have many cases with different parameters in the X
vector. What I want is to schedule the jobs on the number of gpus I have (in my case 2) and monitor which job is done and then submit another job to that gpu so that the gpus are running 24/7 until all jobs are done. I can easily write a script that does this such that after all gpus are done a new batch of parameters are run, but since all the cases do not take the same amount of time to run I’d like to monitor which jobs are done and submit new cases in order to save time since each case is expensive.
Below is my attempt and it doesn’t seem to work.
# sample case function, important that it takes a gpu_id
function case(x::Real; gpu_id=0)
sleeptime = rand([1 10])
sleep(sleeptime)
println("id $gpu_id, sleep time $sleeptime")
nothing
end
function runcases(X::AbstractArray{Float64,1}; num_gpus::Integer=2)
n = size(X,1)
gpuids = [0:num_gpus-1...] # gpu ids are numbered starting from 0
queue = Vector{Task}(num_gpus)
counter = 0
nextidx() = (idx=counter; counter+=1; idx)
idx = nextidx()
@sync begin
for i = 1:num_gpus
idx = nextidx()
queue[i] = @async case(X[idx,1], gpu_id=gpuids[i])
end
yield()
while idx <= n
for i = 1:num_gpus
if istaskdone(queue[i])
idx = nextidx()
if idx > n
break
end
queue[i] = @async case(X[idx,1], gpu_id=gpuids[i])
end
yield()
end
end
end
println("Done")
nothing
end
X = randn(10)
runcases(X,num_gpus=2)