using Base.Threads, LoopVectorization, AMDGPU

f(x) = sin(x)cos(x)tan(x)exp(x)sin(x)cos(x)tan(x)exp(x)
X = randn(100_000_000)
@time f.(X)                          # 4 seconds
@time @threads for x in X; f(x) end  # 0.8 seconds with 16 threads
@time @turbo f.(X)                   # 0.4 seconds
@time f.(ROCArray(X))                # 0.1 seconds on an AMD GPU
@threads uses the whole CPU, but @turbo is faster. Is there a way to get the speedup of @turbo and still use the whole CPU?
The GPU is super fast, but it ends up being slower overall once you include the time to move the data to the GPU and back to the CPU.
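One way to separate the copy cost from the compute (just a sketch, assuming AMDGPU.jl is what provides ROCArray here; Xg and Yh are names I made up):

using AMDGPU

Xg = ROCArray(X)          # warm-up run so first-use compilation is excluded
@time Xg = ROCArray(X)    # host-to-device copy only
@time Yh = Array(f.(Xg))  # GPU broadcast plus the copy back to the CPU;
                          # copying back forces the asynchronous kernel to finish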
julia> f(x) = sin(x)cos(x)tan(x)exp(x)sin(x)cos(x)tan(x)exp(x)
f (generic function with 1 method)
julia> X = randn(100_000_000);
julia> Y = similar(X);
julia> function farr!(y, x)
           y .= f.(x)
           return nothing
       end
farr! (generic function with 1 method)
julia> function farr_th!(y, x)
           Threads.@threads for i in eachindex(x)
               @inbounds y[i] = f(x[i])
           end
           return nothing
       end
farr_th! (generic function with 1 method)
julia> using CUDA

julia> xc = CuArray(X);
julia> yc = CuArray(Y);
julia> @time farr!(Y, X) #Broadcast in CPU
6.336066 seconds (68.31 k allocations: 3.622 MiB, 0.36% compilation time)
julia> @time farr!(Y, X) #Broadcast in CPU
7.044405 seconds
julia> @time farr_th!(Y, X) #4 Threads in CPU
2.102914 seconds (35.83 k allocations: 1.827 MiB, 5.03% compilation time)
julia> @time farr_th!(Y, X) #4 Threads in CPU
2.172290 seconds (22 allocations: 2.156 KiB)
julia> @time farr!(yc, xc) #Broadcast in Cuda GPU
0.198106 seconds (69.00 k allocations: 4.755 MiB, 8.38% compilation time)
julia> @time farr!(yc, xc) #Broadcast in Cuda GPU
0.000071 seconds (42 allocations: 736 bytes)
Though @turbo is faster than plain @threads:
julia> using LoopVectorization

julia> function farr_turbo!(y, x)
           @turbo y .= f.(x)
           return nothing
       end
farr_turbo! (generic function with 1 method)
julia> @time farr_turbo!(Y, X) #Broadcast in CPU turbo
2.754020 seconds (3.65 M allocations: 175.448 MiB, 1.31% gc time, 79.22% compilation time)
julia> @time farr_turbo!(Y, X) #Broadcast in CPU turbo
0.602359 seconds
GPU test including the data movement:
julia> @time farr!(CuArray(Y), CuArray(X)) #Broadcast in Cuda GPU and include allocation
1.474983 seconds (1.76 M allocations: 94.184 MiB, 75.76% compilation time: 24% of which was recompilation)
julia> @time farr!(CuArray(Y), CuArray(X)) #Broadcast in Cuda GPU and include allocation
0.369701 seconds (62 allocations: 1.281 KiB, 8.23% gc time)
Better to test with CUDA.@time farr!(yc, xc), or better still @btime CUDA.@sync farr!($yc, $xc). It looks like 71 µs is too fast and merely measures the kernel launch.
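For example (a sketch with yc and xc as defined above; @btime needs BenchmarkTools):

using CUDA, BenchmarkTools

CUDA.@time farr!(yc, xc)            # synchronizes and reports GPU time and allocations
@btime CUDA.@sync farr!($yc, $xc)   # CUDA.@sync makes sure the kernel has actually finished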
For those kinds of operations, also try Tullio.jl. It works with threads and without LoopVectorization.jl (though if you load both, Tullio.jl will also use LV!).
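A sketch of what that could look like here (farr_tullio! is just a name I made up; Tullio should multithread this on its own, and I haven't checked whether LV can also vectorize through f):

using Tullio

function farr_tullio!(y, x)
    @tullio y[i] = f(x[i])   # elementwise kernel; Tullio threads it for large arrays
    return nothing
end

farr_tullio!(Y, X)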
Also, LoopVectorization.@tturbo does threading, fastmath, SIMD, and some cache optimization, so it's hard to beat naively.
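So the threaded analogue of farr_turbo! above would be something like this (a sketch, assuming @tturbo accepts broadcast expressions the same way @turbo does):

using LoopVectorization

function farr_tturbo!(y, x)
    @tturbo y .= f.(x)   # like @turbo, but also spreads the work across threads
    return nothing
end

farr_tturbo!(Y, X)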
As far as I know, it will multiply by anything that comes after a closing bracket ), except an opening bracket, which is treated like a function call. And you can put a number before anything: 10(1+1)sin(1).
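For example (just illustrating the parsing, nothing to do with the benchmark):

x = 2.0
2x                 # numeric literal coefficient: 2 * x
(x + 1)x           # closing bracket followed by a variable: (x + 1) * x
sin(x)cos(x)       # closing bracket followed by another call: sin(x) * cos(x)
10(1 + 1)sin(1)    # 10 * (1 + 1) * sin(1)
# (x + 1)(x - 1) does NOT multiply; Julia tries to call (x + 1) as a function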