Hello all, sorry to bother you again; it looks like I have run into a similar problem. I hope it's not the same silly mistake as last time.
Let’s define the variables first:
rickfg = fill(0.0 + 0.0im, nf, 1, 1, 1, 1, 1) |> gpu;      # FFT of the Ricker wavelet
fg = fill(0.0 + 0.0im, nf, 1, 1, 1, 1, 1) |> gpu;          # frequency grid
shiftg = fill(0.0 + 0.0im, 1, nr, nz, ny, nx, 1) |> gpu;   # time shifts
Tshiftg = fill(0.0 + 0.0im, 1, 1, 1, 1, 1, nT) |> gpu;     # time shifts for the temporal domain
G_vec_per_rec = fill(0.0 + 0.0im, nf, 1, nz, ny, nx, nT) |> gpu;
G_unvec_per_rec = fill(0.0 + 0.0im, nf, nz*ny*nx*nT) |> gpu;
delay = fill(0.0 + 0.0im, 1, 1, nz, ny, nx, nT) |> gpu;
delay_rf = fill(0.0 + 0.0im, nf, 1, nz, ny, nx, nT) |> gpu;
dunvec = fill(0.0 + 0.0im, nf, nr) |> gpu;
munvec = fill(0.0 + 0.0im, nz*ny*nx*nT, 1) |> gpu;
Later on, these are updated with actual values; to reproduce, you can fill them with random complex numbers (there is no sparsity in any of these arrays). I am using Flux.jl along with CUDA.jl to run everything on the GPU, with nr = 125, nz = 61, ny = 1, nx = 61, nf = 401, nT = 141.
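For instance, something like this should do for the random fill (an untested sketch; I'm using CuArray and CUDA.zeros directly here instead of |> gpu, so that the element type definitely stays ComplexF64):

using CUDA, LinearAlgebra

nr, nz, ny, nx, nf, nT = 125, 61, 1, 61, 401, 141

# Inputs: random complex data stands in for the real values.
rickfg  = CuArray(rand(ComplexF64, nf, 1, 1, 1, 1, 1));
fg      = CuArray(rand(ComplexF64, nf, 1, 1, 1, 1, 1));
shiftg  = CuArray(rand(ComplexF64, 1, nr, nz, ny, nx, 1));
Tshiftg = CuArray(rand(ComplexF64, 1, 1, 1, 1, 1, nT));
munvec  = CuArray(rand(ComplexF64, nz*ny*nx*nT, 1));

# Outputs can stay zero-initialised; they are overwritten in place.
delay           = CUDA.zeros(ComplexF64, 1, 1, nz, ny, nx, nT);
delay_rf        = CUDA.zeros(ComplexF64, nf, 1, nz, ny, nx, nT);
G_vec_per_rec   = CUDA.zeros(ComplexF64, nf, 1, nz, ny, nx, nT);
G_unvec_per_rec = CUDA.zeros(ComplexF64, nf, nz*ny*nx*nT);
dunvec          = CUDA.zeros(ComplexF64, nf, nr);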
The following lines of code, run once for a single receiver index ir,
@time broadcast!(+, delay, view(shiftg, :, ir:ir, :, :, :, :), Tshiftg);
@time broadcast!(*, delay_rf, -im, 2π, fg, delay);
@time broadcast!(exp, delay_rf, delay_rf);
@time broadcast!(*, G_vec_per_rec, rickfg, delay_rf);
@time copyto!(G_unvec_per_rec, G_vec_per_rec);
@time dd = view(dunvec, :, ir:ir);
# @time mm = view(munvec, :, ir:ir)
@time mul!(dd, G_unvec_per_rec, munvec);
return
0.000101 seconds (51 allocations: 1.734 KiB)
0.000052 seconds (7 allocations: 512 bytes)
0.000072 seconds (26 allocations: 2.250 KiB)
0.000058 seconds (7 allocations: 512 bytes)
0.000053 seconds (2 allocations: 32 bytes)
0.000011 seconds (5 allocations: 176 bytes)
0.000139 seconds (17 allocations: 304 bytes)
In all, a total of approx. 0.000500 seconds.
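(One caveat I'm not sure about: CUDA kernels are launched asynchronously, so a plain @time on a single call might only be measuring the launch rather than the actual GPU execution. If that matters, timing the whole block with explicit synchronization would presumably look like this, for one value of ir:)

@time CUDA.@sync begin
    broadcast!(+, delay, view(shiftg, :, ir:ir, :, :, :, :), Tshiftg)
    broadcast!(*, delay_rf, -im, 2π, fg, delay)
    broadcast!(exp, delay_rf, delay_rf)
    broadcast!(*, G_vec_per_rec, rickfg, delay_rf)
    copyto!(G_unvec_per_rec, G_vec_per_rec)
    mul!(view(dunvec, :, ir:ir), G_unvec_per_rec, munvec)
end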
But the following
for i in 1:10
    @time for ir in 1:nr
        broadcast!(+, delay, view(shiftg, :, ir:ir, :, :, :, :), Tshiftg)
        broadcast!(*, delay_rf, -im, 2π, fg, delay)
        broadcast!(exp, delay_rf, delay_rf)
        broadcast!(*, G_vec_per_rec, rickfg, delay_rf)
        copyto!(G_unvec_per_rec, G_vec_per_rec)
        dd = view(dunvec, :, ir:ir)
        # mm = view(munvec, :, ir:ir)
        mul!(dd, G_unvec_per_rec, munvec)
    end
end
returns
0.644182 seconds (14.27 k allocations: 678.453 KiB)
7.918085 seconds (14.27 k allocations: 678.453 KiB)
7.953713 seconds (14.27 k allocations: 678.453 KiB)
7.958340 seconds (14.27 k allocations: 678.453 KiB)
7.879235 seconds (14.27 k allocations: 678.453 KiB)
7.940350 seconds (14.27 k allocations: 678.453 KiB)
8.226722 seconds (14.27 k allocations: 678.453 KiB)
8.210767 seconds (14.27 k allocations: 678.453 KiB)
8.279914 seconds (14.27 k allocations: 678.453 KiB)
8.243146 seconds (14.27 k allocations: 678.453 KiB)
Things roughly add up for the first timing: 0.000500 * nr(=125) * 10 = 0.625 seconds, close to the measured 0.644 seconds. After that, though, the numbers blow up. I don't think it's the global-variables issue this time, because I'm running everything at top level without defining any functions. Am I again missing something basic? Please let me know, and apologies if it's again something simple.
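If it helps narrow things down, I suppose each call inside the loop could also be timed with CUDA.@time, which (as far as I understand) synchronizes the GPU around the expression:

for ir in 1:nr
    CUDA.@time broadcast!(+, delay, view(shiftg, :, ir:ir, :, :, :, :), Tshiftg)
    CUDA.@time broadcast!(*, delay_rf, -im, 2π, fg, delay)
    CUDA.@time broadcast!(exp, delay_rf, delay_rf)
    CUDA.@time broadcast!(*, G_vec_per_rec, rickfg, delay_rf)
    CUDA.@time copyto!(G_unvec_per_rec, G_vec_per_rec)
    CUDA.@time mul!(view(dunvec, :, ir:ir), G_unvec_per_rec, munvec)
end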
I'm tagging you all so that you receive notifications: @goerch @Benny @maphdze @FrancisKing
Thanks for your time!