Arrow stream usage clarification

To allow for streaming, the output has to be written in the Arrow stream format rather than the file format: Arrow.write("outputFile.arrow", largerTableOutput, file=false).

However, this is not needed; you can just use Arrow.append:

# Open the input as a stream so it can be iterated one partition at a time.
MassivePartionedTable_Input = Arrow.Stream("inputFile.arrow")
for eachPartition in MassivePartionedTable_Input
    # Materialize the current partition as a DataFrame for processing.
    df_eachPartition = DataFrame(eachPartition)
    # The result must be bound to the same name that is passed to
    # `Arrow.append` below; a mismatched name raises `UndefVarError`.
    largerTableOutput = DoesSomeThingOnThisPartition(df_eachPartition)
    # Add this partition's result to the output file.
    # NOTE(review): assumes "outputFile.arrow" is either absent or already in
    # stream format when the loop starts — confirm against the Arrow.jl docs.
    Arrow.append("outputFile.arrow", largerTableOutput)
end

Example (input has 2 partitions):

julia> s = Arrow.Stream("test.arrow")
Arrow.Stream(Arrow.ArrowBlob[Arrow.ArrowBlob(UInt8[0xff, 0xff, 0xff, 0xff, 0xa8, 0x00, 0x00, 0x00, 0x10, 0x00  …  0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00], 1, 600)], 1, nothing, Symbol[], Type[], nothing, Dict{Int64, Arrow.DictEncoding}(), Dict{Int64, Arrow.Flatbuf.Field}(), true, Base.RefValue{Union{Nothing, Symbol}}(nothing))

julia> collect(s)
2-element Vector{Any}:
 Arrow.Table with 1 rows, 2 columns, and schema:
 :a  Int64
 :b  Int64
 Arrow.Table with 1 rows, 2 columns, and schema:
 :a  Int64
 :b  Int64

julia> for x in s
           df = DataFrame(x)
           @show df
           Arrow.append("test2.arrow", df)
       end
df = 1×2 DataFrame
 Row │ a      b
     │ Int64  Int64
─────┼──────────────
   1 │     1      2
df = 1×2 DataFrame
 Row │ a      b
     │ Int64  Int64
─────┼──────────────
   1 │     2      3

julia> read("test.arrow") == read("test2.arrow")
true
4 Likes