I ran roflmaostc's code:
using CUDA, BenchmarkTools
"""
    math1!(C, A, B)

Overwrite `C` with an elementwise combination of `A` and `B`, then return `C`.

For every broadcast pair `(a, b)` the stored value is

    a^2 + b^2 + a*b + a/b - a*b - a/b + a*b + a/b - a*b - a/b

evaluated left to right. The cancelling product/quotient terms are not
simplified away, so the per-element arithmetic matches that expression exactly.
"""
function math1!(C, A, B)
    function kernel(a, b)
        ab = a * b
        a_over_b = a / b
        # Strict left-to-right accumulation; do not regroup or cancel terms,
        # because floating-point rounding depends on the evaluation order.
        acc = a^2 + b^2
        acc = acc + ab
        acc = acc + a_over_b
        acc = acc - ab
        acc = acc - a_over_b
        acc = acc + ab
        acc = acc + a_over_b
        acc = acc - ab
        acc = acc - a_over_b
        return acc
    end
    # Single fused in-place broadcast into the caller-provided output.
    @. C = kernel(A, B)
    return C
end
"""
    math2!(D, C)

Overwrite `D` with an elementwise function of `C`, then return `D`.

For every element `c` the stored value is

    c^2 + c^2 + c*c + c/c - c*c - c/c + c*c + c/c - c*c - c/c

evaluated left to right. The cancelling terms are not simplified away, so the
per-element arithmetic matches that expression exactly.
"""
function math2!(D, C)
    function kernel(c)
        sq = c^2
        cc = c * c
        unit = c / c
        # Strict left-to-right accumulation; do not regroup or cancel terms,
        # because floating-point rounding depends on the evaluation order.
        acc = sq + sq
        acc = acc + cc
        acc = acc + unit
        acc = acc - cc
        acc = acc - unit
        acc = acc + cc
        acc = acc + unit
        acc = acc - cc
        acc = acc - unit
        return acc
    end
    # Single fused in-place broadcast into the caller-provided output.
    @. D = kernel(C)
    return D
end
"""
    math3!(E, D)

Overwrite `E` with an elementwise function of `D`, then return `E`.

For every element `d` the stored value is

    d^2 + d^2 + d*d + d/d - d*d - d/d + d*d + d/d - d*d - d/d

evaluated left to right. The cancelling terms are not simplified away, so the
per-element arithmetic matches that expression exactly.
"""
function math3!(E, D)
    function kernel(d)
        sq = d^2
        dd = d * d
        unit = d / d
        # Strict left-to-right accumulation; do not regroup or cancel terms,
        # because floating-point rounding depends on the evaluation order.
        acc = sq + sq
        acc = acc + dd
        acc = acc + unit
        acc = acc - dd
        acc = acc - unit
        acc = acc + dd
        acc = acc + unit
        acc = acc - dd
        acc = acc - unit
        return acc
    end
    # Single fused in-place broadcast into the caller-provided output.
    @. E = kernel(D)
    return E
end
"""
    f()

Benchmark a chain of three in-place elementwise GPU passes.

Allocates `151 × 151 × 151` GPU arrays, then uses `@btime` to time a block that
runs `math1!`, `math2!` and `math3!` back to back 1000 times and finally
reduces the last output with `sum`. The whole block is wrapped in `CUDA.@sync`
so the measurement covers the GPU work rather than only the launches.

Returns the value produced by the benchmarked block (the `sum` of `E`);
`@btime` prints the timing as a side effect.
"""
function f()
    n = 151  # edge length of the cubic test arrays

    # Inputs are offset by 1 — presumably to keep the divisors used by the
    # math kernels away from zero (TODO confirm against CUDA.rand's range).
    A = CUDA.rand(n, n, n) .+ 1
    B = CUDA.rand(n, n, n) .+ 1

    # Output buffers; each is fully overwritten on every pass. Only the three
    # buffers the loop actually writes are allocated.
    C = CUDA.zeros(n, n, n) .+ 1
    D = similar(C)
    E = similar(C)

    # `$` interpolates the locals into the benchmark expression so they are
    # passed in as values rather than looked up as globals.
    result = @btime CUDA.@sync begin
        for iter in 1:1000
            math1!($C, $A, $B)
            math2!($D, $C)
            math3!($E, $D)
        end
        sum($E)
    end
    return result
end
# Script entry point: run the benchmark once when the file is executed.
f()