Intermediate value type instability

Hello, I have something like:

function goofy(num_of_bits::Integer,...)
    if num_of_bits > 64
        tmp=zero(UInt128)
    else
        tmp=zero(UInt64)
    end
    ...
end

which if num_of_bits is <= 64 runs quite faster than a type stable 128 bit version of itself but 30% slower than a type stable 64 bit one.

Should I wrap the two type stable version into an adapter (which have to pass trough all the other parameters) ?

Are there other elegant solutions?

Not sure what you mean by β€œadapter” here but if you have a (unavoidable) type instability you should generally isolate it as much as possible and use function barriers to separate it from performance critical parts (i.e. wrap the computation that happens after your if branch in a function which itself will be type stable).

1 Like

Unfortunately the type instability plagues almost all of the function body and, as this is an β€œinner loop” function, performance is critical.

by adapter I mean something like

function goofy(nb:Integer, ...)
    if nb > 64
        goofy128(nb, ...)
    else
        goofy64(nb, ...)
    end
end

where goofy64 and goofy128 are identical but for the type of tmp

This is code duplication and I find it ugly.

Ok, I realize now that I can spare the function call and duplicate the code inside the function, slightly faster but, if possible, uglier.

The decision whether the computation should use 128 or 64 bit uints should happen before the critical loop (if possible).

In any case the computation itself should live in a separate function. You then call this function with different argument types in the branches, e.g. Vector{UInt128} or Vector{UInt64} respectively. Julia will then automatically specialize for the different argument types (e.g. the different tmps). No need to define two identical functions with different names.

(I’m on the phone and can’t provide an example right now.)

1 Like

Perhaps this should be

goofy(num_of_bits:Integer, args...) = goofy(num_of_bits > 64 ? UInt128 : UInt64, args...)
function goofy(::Type{T}, args...) where {T} # actual expensive function
end
3 Likes

Here is the function

function sendints!(buffer::BitBuffer, num_of_bits::R, sizes::MVector{3, UInt32}, nums::MVector{3, UInt32}) where {R <: Integer}
    if num_of_bits > 64
        result = UInt128(nums[1])
    else
        result = UInt64(nums[1])
    end
    #result = UInt128(nums[1])
    result *= sizes[2]
    result += nums[2]
    result *= sizes[3]
    result += nums[3]

    avail_in_first_byte = mod(8 - buffer.offset, 8)
    freebits = sizeof(result) * 8 - num_of_bits

    # send first bits
    if buffer.offset > 0  
        firstbits = ((result & (0xFF << buffer.offset)) % UInt8) >>> buffer.offset
        buffer.bits[buffer.index] |= firstbits
        buffer.index += 1
        num_of_bits -= avail_in_first_byte
    end

    # first swap needed to smoothly shift between bytes
    result = bswap(result)

    # this correction is needed to get rid of leading zeoroes 
    # that the bitswap sneaked in between
    shiftbytes = cld(freebits, 8)
    shiftbits = rem(freebits, 8)   
    rest = typemax(typeof(result)) << (8 * shiftbytes)
    part = ~ rest
    value = (result & part) << shiftbits
    result = (result & rest) | value

    # this is the motivation for the first bitswap
    result <<= avail_in_first_byte

    # second swap needed to facilitate sending bytes in little endian 
    result = bswap(result)

    num_of_bytes = cld(num_of_bits, 8)
    for _ in 1:num_of_bytes
        byte = result % UInt8
        buffer.bits[buffer.index] = byte
        buffer.index += 1
        result >>>= 8
    end

    buffer.offset = mod(num_of_bits, 8)
    if buffer.offset > 0 
        buffer.index -= 1 # last byte is not complete
    end
    return nothing
end

as you can see the type of result (terrible name as it is just a temporary value) has effects almost everywhere

I don’t measure any speed advantage, I guess it is because the β€œactual expensive” function still has an undecided type dependency

Have you tried something like

function sendints!(buffer num_of_bits, sizes, nums)
    sendints!(buffer, num_of_bits, sizes, nums, num_of_bits > 64 ? UInt128(nums[1]): UInt64(nums[1]))
end
function sendints!(buffer, num_of_bits, sizes, nums, result) # rest of the computation
end
function sendints!(buffer::BitBuffer, num_of_bits::Integer, sizes::MVector{3, UInt32}, nums::MVector{3, UInt32})
    sendints_inner(buffer, num_of_bits, sizes, nums, num_of_bits > 64 ? UInt128(nums[1]) : UInt64(nums[1]))
end

function sendints_inner(buffer::BitBuffer, num_of_bits::Integer, sizes::MVector{3, UInt32}, nums::MVector{3, UInt32}, result::T) where {T <: Unsigned} 

even slower

Could you post a full MWE so that we may look at type-instabilities? Perhaps there are more that need to be resolved

The function could rather be defined like

function sendints!(buffer, resulttype::Type{T}, sizes, nums) where T
    num_of_bits = 8*sizeof(T)
    ...
end

and be called as

r = sendints!(buffer, UInt64, sizes, nums)

or

r = sendints!(buffer, UInt128, sizes, nums)

as per @code_warntype (don’t know what a MWE is) everything is fine (no reds!)

MethodInstance for sendints_inner(::BitBuffer, ::Int64, ::MVector{3, UInt32}, ::MVector{3, UInt32}, ::UInt128)
  from sendints_inner(buffer::BitBuffer, num_of_bits::Integer, sizes::MVector{3, UInt32}, nums::MVector{3, UInt32}, result::T) where T<:Unsigned in Main at /Users/uliano/files/bitbuffer.jl:181
Static Parameters
  T = UInt128
Arguments
  #self#::Core.Const(sendints_inner)
  buffer::BitBuffer
  num_of_bits@_3::Int64
  sizes::MVector{3, UInt32}
  nums::MVector{3, UInt32}
  result@_6::UInt128
Locals
  @_7::Union{Nothing, Tuple{Int64, Int64}}
  num_of_bytes::Int64
  value::UInt128
  part::UInt128
  rest::UInt128
  shiftbits::Int64
  shiftbytes::Int64
  firstbits::UInt8
  freebits::Int64
  avail_in_first_byte::Int64
  byte::UInt8
  num_of_bits@_18::Int64
  result@_19::UInt128
Body::Nothing
1 ─       (result@_19 = result@_6)
β”‚         (num_of_bits@_18 = num_of_bits@_3)
β”‚         Core.NewvarNode(:(@_7))
β”‚         Core.NewvarNode(:(num_of_bytes))
β”‚         Core.NewvarNode(:(value))
β”‚         Core.NewvarNode(:(part))
β”‚         Core.NewvarNode(:(rest))
β”‚         Core.NewvarNode(:(shiftbits))
β”‚         Core.NewvarNode(:(shiftbytes))
β”‚         Core.NewvarNode(:(firstbits))
β”‚   %11 = result@_19::UInt128
β”‚   %12 = Base.getindex(sizes, 2)::UInt32
β”‚         (result@_19 = %11 * %12)
β”‚   %14 = result@_19::UInt128
β”‚   %15 = Base.getindex(nums, 2)::UInt32
β”‚         (result@_19 = %14 + %15)
β”‚   %17 = result@_19::UInt128
β”‚   %18 = Base.getindex(sizes, 3)::UInt32
β”‚         (result@_19 = %17 * %18)
β”‚   %20 = result@_19::UInt128
β”‚   %21 = Base.getindex(nums, 3)::UInt32
β”‚         (result@_19 = %20 + %21)
β”‚   %23 = Base.getproperty(buffer, :offset)::Int64
β”‚   %24 = (8 - %23)::Int64
β”‚         (avail_in_first_byte = Main.mod(%24, 8))
β”‚   %26 = Main.sizeof(result@_19)::Core.Const(16)
β”‚   %27 = (%26 * 8)::Core.Const(128)
β”‚         (freebits = %27 - num_of_bits@_18)
β”‚   %29 = Base.getproperty(buffer, :offset)::Int64
β”‚   %30 = (%29 > 0)::Bool
└──       goto #3 if not %30
2 ─ %32 = result@_19::UInt128
β”‚   %33 = Base.getproperty(buffer, :offset)::Int64
β”‚   %34 = (0xff << %33)::UInt8
β”‚   %35 = (%32 & %34)::UInt128
β”‚   %36 = (%35 % Main.UInt8)::UInt8
β”‚   %37 = Base.getproperty(buffer, :offset)::Int64
β”‚         (firstbits = %36 >>> %37)
β”‚   %39 = Base.getproperty(buffer, :bits)::Vector{UInt8}
β”‚   %40 = Base.getproperty(buffer, :index)::Int64
β”‚   %41 = Base.getindex(%39, %40)::UInt8
β”‚   %42 = (%41 | firstbits)::UInt8
β”‚   %43 = Base.getproperty(buffer, :bits)::Vector{UInt8}
β”‚   %44 = Base.getproperty(buffer, :index)::Int64
β”‚         Base.setindex!(%43, %42, %44)
β”‚   %46 = Base.getproperty(buffer, :index)::Int64
β”‚   %47 = (%46 + 1)::Int64
β”‚         Base.setproperty!(buffer, :index, %47)
└──       (num_of_bits@_18 = num_of_bits@_18 - avail_in_first_byte)
3 β”„       (result@_19 = Main.bswap(result@_19))
β”‚         (shiftbytes = Main.cld(freebits, 8))
β”‚         (shiftbits = Main.rem(freebits, 8))
β”‚   %53 = Main.typeof(result@_19)::Core.Const(UInt128)
β”‚   %54 = Main.typemax(%53)::Core.Const(0xffffffffffffffffffffffffffffffff)
β”‚   %55 = (8 * shiftbytes)::Int64
β”‚         (rest = %54 << %55)
β”‚         (part = ~rest)
β”‚   %58 = (result@_19 & part)::UInt128
β”‚         (value = %58 << shiftbits)
β”‚   %60 = (result@_19 & rest)::UInt128
β”‚         (result@_19 = %60 | value)
β”‚         (result@_19 = result@_19 << avail_in_first_byte)
β”‚         (result@_19 = Main.bswap(result@_19))
β”‚         (num_of_bytes = Main.cld(num_of_bits@_18, 8))
β”‚   %65 = (1:num_of_bytes)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
β”‚         (@_7 = Base.iterate(%65))
β”‚   %67 = (@_7 === nothing)::Bool
β”‚   %68 = Base.not_int(%67)::Bool
└──       goto #6 if not %68
4 β”„ %70 = @_7::Tuple{Int64, Int64}
β”‚         Core.getfield(%70, 1)
β”‚   %72 = Core.getfield(%70, 2)::Int64
β”‚         (byte = result@_19 % Main.UInt8)
β”‚   %74 = Base.getproperty(buffer, :bits)::Vector{UInt8}
β”‚   %75 = byte::UInt8
β”‚   %76 = Base.getproperty(buffer, :index)::Int64
β”‚         Base.setindex!(%74, %75, %76)
β”‚   %78 = Base.getproperty(buffer, :index)::Int64
β”‚   %79 = (%78 + 1)::Int64
β”‚         Base.setproperty!(buffer, :index, %79)
β”‚         (result@_19 = result@_19 >>> 8)
β”‚         (@_7 = Base.iterate(%65, %72))
β”‚   %83 = (@_7 === nothing)::Bool
β”‚   %84 = Base.not_int(%83)::Bool
└──       goto #6 if not %84
5 ─       goto #4
6 β”„ %87 = Main.mod(num_of_bits@_18, 8)::Int64
β”‚         Base.setproperty!(buffer, :offset, %87)
β”‚   %89 = Base.getproperty(buffer, :offset)::Int64
β”‚   %90 = (%89 > 0)::Bool
└──       goto #8 if not %90
7 ─ %92 = Base.getproperty(buffer, :index)::Int64
β”‚   %93 = (%92 - 1)::Int64
└──       Base.setproperty!(buffer, :index, %93)
8 β”„       return Main.nothing

num_of_bits is an input parameter (how many bits to send to the buffer after the β€˜compression’ has happened) and may be any number between 72 and, let me see… 9.

Then

function sendints!(buffer, num_of_bits, resulttype{T}, sizes, nums) where T
    ...
end

would still isolate the loop into a single function and separate it from the decision which result type to use, which generally is preferable.

I have no result, this function has only side effects (on buffer which is type stable)

function sendints!(buffer, num_of_bits, ::Type{T}, sizes, nums) where T
    result = T(num[1])
    ...
end

should work and be type stable? E.g.

sendints!(buffer, 65, UInt128, sizes, nums)

I’ve come up with the idea that, in this particular case, the quest for speed has to be payed in terms of, as ugly as it may be, code duplication (I have also had to rename four variables to reach type stability)

function sendints!(buffer::BitBuffer, num_of_bits::Integer, sizes::MVector{3, UInt32}, nums::MVector{3, UInt32}) 
    if num_of_bits > 64
        result = UInt128(nums[1])
        result *= sizes[2]
        result += nums[2]
        result *= sizes[3]
        result += nums[3]

        avail_in_first_byte = mod(8 - buffer.offset, 8)
        freebits = sizeof(result) * 8 - num_of_bits

        # send first bits
        if buffer.offset > 0  
            firstbits = ((result & (0xFF << buffer.offset)) % UInt8) >>> buffer.offset
            buffer.bits[buffer.index] |= firstbits
            buffer.index += 1
            num_of_bits -= avail_in_first_byte
        end

        # first swap needed to smoothly shift between bytes
        result = bswap(result)

        # this correction is needed to get rid of leading zeoroes 
        # that the bitswap sneaked in between
        shiftbytes = cld(freebits, 8)
        shiftbits = rem(freebits, 8)   
        rest = typemax(typeof(result)) << (8 * shiftbytes)
        part = ~ rest
        value = (result & part) << shiftbits
        result = (result & rest) | value

        # this is the motivation for the first bitswap
        result <<= avail_in_first_byte

        # second swap needed to facilitate sending bytes in little endian 
        result = bswap(result)

        num_of_bytes = cld(num_of_bits, 8)
        for _ in 1:num_of_bytes
            byte = result % UInt8
            buffer.bits[buffer.index] = byte
            buffer.index += 1
            result >>>= 8
        end

    else
        result64 = UInt64(nums[1])
        result64 *= sizes[2]
        result64 += nums[2]
        result64 *= sizes[3]
        result64 += nums[3]

        avail_in_first_byte = mod(8 - buffer.offset, 8)
        freebits = sizeof(result64) * 8 - num_of_bits

        # send first bits
        if buffer.offset > 0  
            firstbits = ((result64 & (0xFF << buffer.offset)) % UInt8) >>> buffer.offset
            buffer.bits[buffer.index] |= firstbits
            buffer.index += 1
            num_of_bits -= avail_in_first_byte
        end

        # first swap needed to smoothly shift between bytes
        result64 = bswap(result64)

        # this correction is needed to get rid of leading zeoroes 
        # that the bitswap sneaked in between
        shiftbytes = cld(freebits, 8)
        shiftbits = rem(freebits, 8)   
        rest64 = typemax(typeof(result64)) << (8 * shiftbytes)
        part64 = ~ rest64
        value64 = (result64 & part64) << shiftbits
        result64 = (result64 & rest64) | value64

        # this is the motivation for the first bitswap
        result64 <<= avail_in_first_byte

        # second swap needed to facilitate sending bytes in little endian 
        result64 = bswap(result64)

        num_of_bytes = cld(num_of_bits, 8)
        for _ in 1:num_of_bytes
            byte = result64 % UInt8
            buffer.bits[buffer.index] = byte
            buffer.index += 1
            result64 >>>= 8
        end
    end
    buffer.offset = mod(num_of_bits, 8)
    if buffer.offset > 0 
        buffer.index -= 1 # last byte is not complete
    end
    return nothing
end