Reading a few rows from a BIG CSV file

Thanks for this - it does appear to be the correct file.

pulling it down and applying the arrow transform (which took 51.905011 seconds (218.61 M allocations: 10.079 GiB, 4.80% gc time, 39.06% compilation time: 10% of which was recompilation) - arrow conversion, 1 thread) :

using Arrow, CSV, DataFrames, BenchmarkTools

 file = raw"ONSPD_NOV_2023_UK.csv"
 convertedFile = raw"ONSPD_NOV_2023_UK.arrow"

 @time Arrow.write(convertedFile, CSV.File(file,header=true, normalizenames=true), ntasks=1)

function filterRows(arrowCol, f)
  
    rowIndex = 0
    rows = Vector{Int32}()
    for r in arrowCol
        rowIndex += 1
        if f(r)
            push!(rows, rowIndex)
        end
    end

    rows

end

function getArrowSubset(filterRows, selectedCols, dfp)
    df = DataFrame()
    for col in selectedCols
        dfCol = dfp[col]
        df[:, col] = dfCol[filterRows]
    end

df
end

function psamTest(psamfile)
    
    dfp = Arrow.Table(psamfile)
    filterCol = dfp[:pcds ]
    setValues = Set(["RG24 4PA", "CB3 0AP", "CR9 1DG", "EN1 3XY", "NW1W 8JN", "N22 7SG", "L36 9YU", "OX1 9TY", "CV21 9NL", "SY1 1NA", "L1 5BL", "DY10 3TE", "NG5 1DD", "OX16 9AA", "E5 0LL", "OX1 2AQ", "WF1 2AP", "BN11 4HY", "S70 2TA", "MK45 2DD", "B91 1DD", "NR27 0LJ", "LN11 0BX", "BS5 6HN", "WC2R 1LA", "CB3 0HZ", "E1 1EJ", "WC1N 2LX", "BS40 8QB", "BS6 6SU", "SE3 7SE", "OX2 7TG"])
    cols=[:pcds, :dointr, :doterm, :osward, :pcon]
    f = x -> in(x, setValues)

    #subset(dfp, :NP => ByRow(x -> rand() < 0.2 && x >= 3))    
    return getArrowSubset(filterRows(filterCol, f), cols , dfp)
end
function main()
    psamfile = raw"ONSPD_NOV_2023_UK.arrow"
    @btime psamTest($psamfile)
    return nothing
end

main()

yielding: 98.880 ms (2701325 allocations: 71.18 MiB)

Note that this is done quite naively in a sense as the arrow isnt dictEncoded so filtering will be slow for longer strings (postcodes probably arent problematic of course). if you wanted to chain together other filter predicates, that would be relatively straightforward since you can call them in a loop and intersect in sequence then apply the subset colums etc.

Regards,

3 Likes