How to parse a long string when regex fails

I have strings like:

Automaton(“det”,266,4,[[1, 127, 50, 50, 115, 249, 50, 8, 257, 10, 151, 12, 13, 14, 81, 78, 34, 106, 137, 107, 21, 22, 82, 89, 71, 43, 43, 62, 63, 63, 44, 10, 179, 171, 214, 265, 206, 38, 137, 152, 21, 151, 71, 22, 41, 41, 47, 13, 49, 50, 50, 210, 50, 210, 116, 90, 64, 49, 207, 207, 207, 62, 232, 64, 64, 89, 151, 236, 236, 70, 71, 236, 116, 260, 128, 75, 13, 38, 77, 133, 14, 82, 13, 13, 77, 235, 64, 64, 231, 206, 138, 91, 91, 206, 90, 264, 137, 169, 10, 10, 41, 71, 232, 228, 232, 21, 21, 107, 106, 110, 111, 266, 256, 114, 114, 116, 251, 251, 116, 114, 114, 111, 266, 260, 122, 182, 127, 128, 129, 213, 132, 132, 129, 49, 49, 138, 138, 138, 249, 71, 43, 43, 210, 235, 183, 214, 132, 265, 206, 138, 232, 64, 64, 153, 153, 64, 152, 152, 82, 236, 236, 82, 116, 116, 64, 207, 266, 64, 231, 169, 171, 250, 227, 70, 175, 175, 176, 10, 47, 178, 178, 110, 111, 266, 260, 122, 266, 127, 213, 49, 49, 152, 257, 153, 257, 90, 90, 183, 250, 264, 62, 228, 229, 82, 82, 206, 64, 266, 249, 249, 249, 242, 213, 214, 64, 116, 114, 114, 50, 50, 236, 236, 261, 115, 152, 251, 8, 228, 229, 229, 231, 232, 237, 235, 235, 236, 237, 235, 235, 235, 235, 235, 235, 242, 242, 242, 115, 251, 249, 250, 116, 116, 116, 207, 64, 265, 266, 257, 256, 266, 264, 261, 260, 264, 265, 266], [1, 125, 1, 114, 114, 121, 115, 3, 23, 19, 13, 15, 230, 28, 201, 202, 16, 84, 123, 103, 105, 208, 114, 84, 172, 199, 199, 119, 83, 83, 123, 170, 16, 31, 27, 26, 25, 104, 102, 23, 28, 13, 120, 208, 48, 103, 19, 230, 239, 1, 114, 218, 115, 219, 82, 140, 23, 208, 204, 216, 216, 119, 174, 119, 119, 84, 13, 115, 115, 50, 120, 50, 82, 204, 262, 79, 230, 104, 203, 85, 28, 114, 230, 230, 203, 3, 23, 119, 96, 96, 96, 123, 102, 140, 208, 3, 200, 200, 98, 97, 103, 120, 119, 120, 119, 28, 105, 103, 105, 54, 3, 23, 141, 1, 250, 50, 82, 114, 50, 1, 250, 3, 3, 219, 219, 125, 54, 263, 124, 124, 27, 124, 223, 208, 239, 25, 208, 208, 120, 120, 217, 217, 217, 120, 120, 142, 142, 141, 140, 140, 174, 174, 174, 204, 216, 23, 119, 23, 250, 114, 114, 
250, 162, 162, 159, 205, 159, 159, 208, 123, 170, 1, 219, 50, 208, 208, 123, 177, 177, 19, 170, 173, 172, 172, 199, 199, 23, 186, 185, 184, 184, 23, 120, 216, 23, 140, 208, 120, 1, 3, 119, 120, 114, 114, 250, 239, 119, 3, 121, 3, 120, 219, 212, 212, 23, 50, 1, 250, 1, 114, 50, 114, 219, 114, 23, 114, 3, 120, 114, 114, 208, 119, 125, 121, 3, 50, 54, 120, 3, 120, 121, 3, 120, 218, 219, 217, 114, 114, 3, 1, 250, 250, 250, 253, 252, 173, 172, 120, 125, 3, 3, 219, 219, 3, 54, 3], [1, 1, 249, 249, 6, 1, 1, 247, 7, 130, 167, 17, 11, 66, 33, 33, 131, 136, 10, 32, 150, 9, 112, 36, 149, 112, 149, 56, 167, 37, 181, 130, 35, 147, 139, 51, 139, 11, 10, 234, 147, 37, 148, 113, 136, 136, 146, 24, 249, 1, 1, 249, 249, 249, 139, 6, 238, 127, 134, 238, 134, 9, 148, 238, 134, 167, 36, 50, 249, 247, 73, 249, 51, 139, 112, 167, 76, 76, 11, 167, 80, 73, 67, 66, 66, 135, 58, 58, 112, 139, 233, 100, 100, 2, 6, 95, 10, 32, 130, 130, 99, 94, 94, 56, 149, 150, 147, 150, 136, 51, 51, 51, 7, 51, 51, 51, 7, 6, 139, 139, 139, 2, 58, 134, 139, 7, 1, 112, 112, 249, 238, 238, 112, 249, 127, 238, 233, 238, 1, 149, 112, 149, 249, 135, 7, 139, 238, 51, 139, 238, 112, 238, 233, 234, 234, 134, 234, 190, 112, 50, 249, 73, 139, 51, 238, 134, 51, 134, 148, 32, 147, 145, 145, 145, 258, 259, 180, 130, 146, 189, 189, 51, 51, 134, 134, 139, 2, 1, 249, 127, 249, 191, 191, 191, 188, 188, 188, 188, 198, 197, 196, 196, 195, 187, 187, 139, 233, 134, 127, 127, 127, 49, 249, 139, 243, 211, 211, 211, 210, 210, 210, 210, 95, 209, 241, 209, 126, 117, 117, 9, 112, 112, 50, 50, 50, 50, 50, 50, 49, 49, 127, 127, 127, 249, 249, 249, 7, 7, 1, 247, 51, 139, 211, 134, 134, 51, 51, 7, 7, 2, 126, 145, 139, 247, 51, 51], [1, 51, 3, 4, 4, 7, 7, 50, 73, 61, 11, 12, 40, 109, 101, 93, 45, 18, 19, 20, 21, 64, 161, 29, 193, 118, 193, 65, 11, 42, 19, 59, 30, 46, 163, 164, 163, 40, 39, 57, 46, 42, 246, 60, 18, 21, 61, 158, 72, 50, 51, 53, 53, 3, 55, 55, 57, 246, 156, 64, 65, 64, 155, 64, 65, 11, 29, 68, 69, 236, 160, 72, 73, 55, 157, 11, 
154, 154, 40, 11, 46, 160, 225, 192, 192, 86, 87, 88, 157, 118, 155, 19, 39, 74, 161, 239, 92, 92, 166, 254, 20, 144, 88, 240, 65, 109, 108, 21, 21, 50, 50, 73, 73, 50, 247, 236, 73, 161, 72, 3, 5, 54, 86, 239, 3, 51, 50, 64, 64, 161, 165, 64, 157, 161, 245, 165, 60, 64, 51, 240, 161, 240, 4, 144, 51, 55, 57, 73, 55, 57, 157, 157, 155, 57, 64, 156, 64, 87, 118, 160, 161, 248, 163, 164, 165, 168, 164, 168, 60, 19, 108, 3, 3, 72, 64, 60, 19, 194, 194, 61, 59, 247, 247, 193, 193, 5, 74, 7, 69, 244, 69, 156, 240, 65, 74, 74, 246, 143, 219, 86, 88, 144, 222, 222, 226, 72, 60, 239, 52, 54, 143, 239, 72, 72, 215, 221, 219, 224, 219, 220, 221, 222, 239, 220, 215, 222, 54, 160, 160, 161, 64, 64, 160, 68, 236, 236, 236, 160, 239, 240, 244, 245, 246, 69, 72, 161, 51, 160, 50, 50, 248, 118, 226, 255, 255, 248, 248, 160, 160, 245, 245, 72, 72, 236, 236, 236]],[12],[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 
250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266])

For shorter versions of this string I am using

 m = match(r"(\d+).*(\d+).*(\[\[.*\]\])", line)

to extract the first two numbers and the first array of arrays.

However for strings of the length above and longer this fails with:

PCRE.exec error: match limit exceeded

How can I get hold of the two numbers and the first Array of Arrays in strings this long and much longer?

Well, if your string contains just a vector of vectors and nothing more surprising, you can try to parse it manually:

> s = your_long_string
> fidx = findfirst('[', s)
> lidx = findfirst(']', s)
> s2 = s[fidx:lidx]
> s3 = replace(s2, ['[', ']', ','] => "")
> a = split(s3)
> b = parse.(Int, a)

Alternatively, it seems like you could define an Automaton struct for which the result of @show is the same as the string you have, and let eval do the dirty work for you:

> struct Automaton; a :: String; b :: Int; c :: Int; d :: Vector{Vector{Int}}; end
> b = eval(Meta.parse("Automaton(\"det\", 5, 6, [[1, 2, 3], [4, 5, 6]])"))

I don’t think this works. Take this shorter example:

Automaton(“det”,2,4,[[1, 2], [2, 0], [2, 0], [1, 2]],[1],[1, 2])

This code

m = match(r"(\d+).*(\d+).*(\[\[.*\]\])", s)

gives me “2”, “4” and “[[1, 2], [2, 0], [2, 0], [1, 2]]” which I can then parse. I would like to do the same thing for longer examples.

I think this variant of your solution works:

# Pull the leading integer out of the line; only the first capture group is
# used in what follows.
m = match(r"(\d+).*(\d+)", line)
val = parse(Int32, m.captures[1])
# Locate the array of arrays: it runs from the first "[[" through the first "]]".
# Searching for a string needle yields the index range of the match.
fidx = findfirst("[[", line)
lidx = findfirst("]]", line)
# Hand that slice to Julia's own parser, which returns an expression whose
# arguments are the inner array expressions.
expr = Meta.parse(line[first(fidx):last(lidx)])
# Convert the arguments of every inner array expression to a Vector{Int32}.
arrs = map(inner -> Vector{Int32}(inner.args), expr.args)

I also want to do this for huge strings where this will be slow/memory expensive but I will ask a separate question if I can’t get that to work.

Ah, I had understood that you wanted the first inner vector, not the vector of vectors. My second solution does not work then. You need to change the code so that it looks not for the first ‘]’ character but for another separator.

Doesn't this updated version of the code (below) give you what you want in the fields b, c, and d of the returned Automaton object?

> struct Automaton; a :: String; b :: Int; c :: Int; d :: Vector{Vector{Int}}; e :: Vector{Int}; f :: Vector{Int}; end
> b = eval(Meta.parse("Automaton(\"det\",2,4,[[1, 2], [2, 0], [2, 0], [1, 2]],[1],[1, 2])"))
1 Like

I will focus on the nested part, the first two numbers you can do with a regex. For large amounts of data, a finite state machine would work:

"""
    parse_nested_vectors(str)

Parse a string of the form `"[[1, 2], [3], [4, 5]]"` into a
`Vector{Vector{Int}}` with a single pass over the characters.

Only non-negative decimal integers, commas, spaces and exactly two levels of
square brackets are accepted.

Failure modes:
- nesting deeper than two levels raises `error("not supported")`;
- a digit outside an inner vector, any other unexpected character, or
  unbalanced brackets trips an `@assert`.
"""
function parse_nested_vectors(str)
    result = Vector{Vector{Int}}()
    level = 0                 # current bracket nesting depth
    v = Vector{Int}()         # inner vector currently being filled
    # The pending number is tracked with a concretely typed accumulator and a
    # flag. No closure captures (and reassigns) these variables, so they are
    # not boxed and the per-character loop stays type-stable.
    num = 0
    have_num = false
    for c in str
        if c == '['
            level += 1
            if level == 2
                v = Vector{Int}()
            elseif level > 2
                error("not supported")
            end
        elseif c == ']'
            # A closing bracket without a matching opener would otherwise
            # drive the depth negative and could be cancelled out later.
            @assert level > 0
            if have_num
                push!(v, num)
                num = 0
                have_num = false
            end
            level == 2 && push!(result, v)
            level -= 1
        elseif c == ','
            # Commas between inner vectors (level 1) carry no number.
            if level == 2 && have_num
                push!(v, num)
                num = 0
                have_num = false
            end
        elseif isdigit(c)
            @assert level == 2
            # isdigit only accepts '0'-'9', so the offset from '0' is the digit value.
            num = 10 * num + (c - '0')
            have_num = true
        else
            @assert c == ' '    # the only other thing that can happen
        end
    end
    @assert level == 0
    result
end

str = "[[1, 127, 50, 50], [115], [249, 50, 8, 257]]" # MWE
julia> parse_nested_vectors(str)
3-element Array{Array{Int64,1},1}:
 [1, 127, 50, 50]
 [115]
 [249, 50, 8, 257]

If you need negative integers or floats, I would recommend one of the parser packages for parsing the relevant bits.

1 Like

Thank you so much! This looks like a really useful answer.

I have got this code running for me now but it is slower than I expected. I get between 450k and 700k characters processed per second. Is there something obvious I could check to see if it can be sped up?

I did not really optimize anything in that code. You could start checking the things in the performance tips, but frankly, I would just let it finish and save the data in a format that does not need to be parsed.

Effectively you are dealing with a ragged array: just transform into a flat vector and save the index boundaries.