Reading the julia inv source code (https://github.com/JuliaLang/julia/blob/master/base/complex.jl),
it appears that Complex{Float32} inverse convert (widen) to Complex{Float64} and calls the corresponding specialized implementation. It does explain at least a part of the overhead.
I do not know enough to tell if this implementation choice is optimal or not.
inv(z::Complex{<:Union{Float16,Float32}}) =
oftype(z, inv(widen(z)))
/(z::Complex{T}, w::Complex{T}) where {T<:Union{Float16,Float32}} =
oftype(z, widen(z)*inv(widen(w)))
ComplexF64 specialization
# robust complex division for double precision
# variables are scaled & unscaled to avoid over/underflow, if necessary
# based on arxiv.1210.4539
# a + i*b
# p + i*q = ---------
# c + i*d
function /(z::ComplexF64, w::ComplexF64)
a, b = reim(z); c, d = reim(w)
absa = abs(a); absb = abs(b); ab = absa >= absb ? absa : absb # equiv. to max(abs(a),abs(b)) but without NaN-handling (faster)
absc = abs(c); absd = abs(d); cd = absc >= absd ? absc : absd
halfov = 0.5*floatmax(Float64) # overflow threshold
twounϵ = floatmin(Float64)*2.0/eps(Float64) # underflow threshold
# actual division operations
if ab>=halfov || ab<=twounϵ || cd>=halfov || cd<=twounϵ # over/underflow case
p,q = scaling_cdiv(a,b,c,d,ab,cd) # scales a,b,c,d before division (unscales after)
else
p,q = cdiv(a,b,c,d)
end
return ComplexF64(p,q)
end
This text will be hidden