Sunday Small challenge

Hello Julians!
Welcome to the first Sunday Small challenge.
I will post 3 lil’ problems - whoever can solve them in Julia (in serial, all architectures allowed) on the input(s) repeated 1000x wins all the internet points. Each problem is it’s own category.
Challenge ends next Sunday.

  1. Remove HTML Tags
# Input - assume the string is ASCII
str = Vector{UInt8}("<div>Hello <b>JuliaCon2022!</b></div>")
# Output satisfies 
unhtml(str) == Vector{UInt8}("Hello JuliaCon2022!")
  1. Hamming distance - count the number of corresponding unequal elements in 2 ordered collections of equal size.
# input - assume ASCII
s1 = "AAABBB"; s2 = "AAACCC";
# output
hamming(s1, s2) == 3
  1. Max Parenthesis depth - count the deepest level of nesting for parenthesis.
# Input, assume ASCII
str = "(()()(((((())))()()()(((()))()()())))))()()"
# Output
depth(str) == 7
1 Like

Hey @miguelraz

I got a solution for number 2:

function hamming(x,y)
    @assert length(x) == length(y)
    hd = 0
    for i in 1:length(x)
        if x[i] != y[i]
            hd += 1
        end
    end
    return hd
end
1 Like

Here is a SIMD implementation taking advantage of AVX512.
Note! That means it requires AVX512 to be fast. It’ll probably be pretty slow otherwise.

It’s also probably suboptimal, but good enough for now

julia> cppstr = "<div>Hello <b>CppNorth!</b></div>";

julia> cppstrlong = cppstr^10;

julia> v = UInt8[];

julia> vunhtml!(v, cppstr);

julia> Base.unsafe_string(pointer(v), length(v))
"Hello CppNorth!"

julia> vunhtml!(v, cppstrlong);

julia> Base.unsafe_string(pointer(v), length(v))
"Hello CppNorth!Hello CppNorth!Hello CppNorth!Hello CppNorth!Hello CppNorth!Hello CppNorth!Hello CppNorth!Hello CppNorth!Hello CppNorth!Hello CppNorth!"

julia> @benchmark vunhtml!($v, $cppstr)
BenchmarkTools.Trial: 10000 samples with 996 evaluations.
 Range (min … max):  25.315 ns …  1.095 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     26.340 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   27.295 ns ± 11.404 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▅██▇▅▂▁▁                                                    ▂
  ██████████▇▇▇▇▅▅▆▆▄▄▅▅▅▅▅▅▄▅▅▄▅▅▅▄▅▄▅▄▂▃▃▄▄▄▄▄▅▅▅▆▆▆▇▇▇▆▆▆▇ █
  25.3 ns      Histogram: log(frequency) by time      48.2 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark vunhtml!($v, $cppstrlong)
BenchmarkTools.Trial: 10000 samples with 955 evaluations.
 Range (min … max):  88.157 ns … 175.039 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     91.343 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   93.188 ns ±   6.414 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▅    ▆█ ▂▂ ▁▂▁▁                                              ▁
  █▆▅█▆████████████▇█▇▇▇▇▇▆▆▅▆▆▆▆▅▅▆▆▆▅▅▅▅▄▆▆▅▅▅▆▅▇▇▇▇▆▆▇▇▇▇▇▆ █
  88.2 ns       Histogram: log(frequency) by time       119 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

Implementation:

using VectorizationBase

function vunhtml!(output::Vector{UInt8}, input::AbstractVector{UInt8})
  N = length(input)
  resize!(output, N)
  n = 0
  W = VectorizationBase.pick_vector_width(UInt8)
  GC.@preserve output input begin
    pinput = VectorizationBase.zstridedpointer(input)
    pout = pointer(output)
    i = 0
    while i < N
      m = VectorizationBase.mask(W, i, N)
      md = VectorizationBase.data(m)
      # @show i md
      v = vload(pinput, (MM{Int(W)}(i),), m)
      ml = v == UInt8('<')
      mu = v == UInt8('>')
      muu = VectorizationBase.data(mu)
      mlu = VectorizationBase.data(ml)
      i += 64
      m2 = VectorizationBase.Mask(m)
      if mlu > muu
        lz = leading_zeros(mlu)
        truncflag = (one(UInt) << (8sizeof(UInt) - 1 - lz))
        mlu -= truncflag
        m2 &= VectorizationBase.Mask{Int(W)}((truncflag-one(truncflag)))
        i -= 1 + lz
      end
      cmu = ~((muu - mlu) + muu)
      cm = VectorizationBase.Mask{Int(W)}(cmu) & m2
      cmd = VectorizationBase.data(cm)
      # @show muu mlu cmu 
      VectorizationBase.compressstore!(pout + n, v, cm)
      n += count_ones(cm)
    end
  end
  resize!(output, n)
  output
end
vunhtml!(output, input::String) = vunhtml!(output, codeunits(input))
4 Likes

For lovers of succinctness:

unhtml(str) = replace(String(str), r"</?\w+>" => "")

depth(str) = cumsum(get(Dict('(' => +1, ')' => -1), c, 0)
                    for c in str) |> maximum

hamming(s1, s2) = sum(collect(s1) .!= collect(s2))
5 Likes

Here’s a similar one for depth.

depth(str) = cumsum(2('(' - c) + 1 for c in str) |> maximum

1 Like

In case source string is a known const, you don’t need any regexes, the answer is already known. If it not exactly that and can be some other valid HTML:


unhtml(str) = replace(String(str), r"</?\w+>" => "")

s1 = "<div>Hello <b>JuliaCon2022!</b></div>"
s2 = """
<div id="header" class="container_10">Hello <b>JuliaCon2022!</b></div>
"""

s1v = Vector{UInt8}(s1)
s2v = Vector{UInt8}(s2)

julia> unhtml(s1v)
"Hello JuliaCon2022!"

julia> unhtml(s2v)
"<div id=\"header\" class=\"container_10\">Hello JuliaCon2022!\n"

BTW the last tag in the original post should probably be a closing one: </div>

1 Like

Here’s one for parens depth that doesn’t allocate, for a 10x speedup, and handles strings that aren’t just ‘(’ and ‘)’:

function pdepth(str)
    current = 0
    maxdepth = -1
    for c in str
        current += (c == '(') - (c == ')')
        maxdepth = max(maxdepth, current)
    end
    return maxdepth
end
2 Likes

How should the code handle leading )s? What is the correct value for

str = ")))(())()()("

or

str = "(((()"

?

2

hamming(x, y) = sum(Base.splat(!=), zip(x, y))

Thanks, I’ve fixed the closing tag, yes.

We can also obfuscate it :stuck_out_tongue:

depth(str) = foldl(((c, m), n) -> (c+n, max(m, c+n)), 
    (2('(' - c) + 1 for c in str); init = (0, 0))[2]