How to parse a long string when regex fails

I will focus on the nested part; the first two numbers can be extracted with a regex. For large amounts of data, a finite state machine works well:

"""
    parse_nested_vectors(str)

Parse text such as `"[[1, 2], [3]]"` — an outer bracket pair wrapping bracketed,
comma-separated lists of non-negative integers — into a `Vector{Vector{Int}}`.

`str` is scanned once, character by character. The only accepted characters are `[`,
`]`, `,`, the ASCII digits and the space character.

# Throws
- `ArgumentError`: on an unexpected character, a digit outside an inner list, digits of
  one number separated by a space, a `]` without a matching `[`, or an unclosed `[`.
- `ErrorException`: if brackets are nested more than two levels deep.
- `OverflowError`: if a number does not fit in an `Int`.

# Examples
```jldoctest
julia> parse_nested_vectors("[[1, 2], [3]]") == [[1, 2], [3]]
true
```
"""
function parse_nested_vectors(str)
    result = Vector{Vector{Int}}()
    current = Int[]     # inner list currently being filled
    level = 0           # current bracket nesting depth
    # The number state is kept in plain locals (no closure), so nothing is boxed and the
    # loop stays type-stable.
    value = 0           # number being accumulated; meaningful only while `pending`
    pending = false     # true when `value` holds digits not yet pushed to `current`
    gap = false         # true when a space was seen after the last digit
    for c in str
        if c == '['
            level += 1
            if level == 2
                current = Int[]
            elseif level > 2
                error("not supported")
            end
        elseif c == ']'
            # Without this check, input like "][" would end at level 0 and be accepted.
            level > 0 || throw(ArgumentError("']' without a matching '['"))
            if level == 2
                pending && push!(current, value)
                push!(result, current)
                value, pending = 0, false
            end
            level -= 1
        elseif c == ','
            if level == 2 && pending
                push!(current, value)
                value, pending = 0, false
            end
        elseif isdigit(c)
            level == 2 ||
                throw(ArgumentError("digit $(repr(c)) found outside of an inner list"))
            # "1 2" must not be read as the single number 12.
            pending && gap &&
                throw(ArgumentError("numbers must be separated by ',', not by a space"))
            # Checked arithmetic: an overlong digit run raises instead of wrapping around.
            value = Base.checked_add(Base.checked_mul(10, value), c - '0')
            pending = true
            gap = false
        elseif c == ' '
            gap = true
        else
            throw(ArgumentError("unexpected character $(repr(c))"))
        end
    end
    level == 0 || throw(ArgumentError("unbalanced brackets: $level '[' left unclosed"))
    return result
end

str = "[[1, 127, 50, 50], [115], [249, 50, 8, 257]]" # MWE
julia> parse_nested_vectors(str)
3-element Array{Array{Int64,1},1}:
 [1, 127, 50, 50]
 [115]
 [249, 50, 8, 257]

If you need negative integers or floats, I would recommend one of the parser packages for parsing the relevant bits.

1 Like