You would need Arrow.write("outputFile.arrow", largerTableOutput; file=false)
to write in the streaming (IPC stream) format.
However, that is not necessary here — you can simply use Arrow.append:
# Stream the input file one record batch (partition) at a time, so the whole
# table never has to fit in memory at once.
MassivePartionedTable_Input = Arrow.Stream("inputFile.arrow")
for eachPartition in MassivePartionedTable_Input
    df_eachPartition = DataFrame(eachPartition)
    # Transform this partition, then append the result to the output file.
    # (Fixed: the original assigned `largerTableOuput` — note the missing "t" —
    # but appended `largerTableOutput`, which would throw an UndefVarError.)
    largerTableOutput = DoesSomeThingOnThisPartition(df_eachPartition)
    Arrow.append("outputFile.arrow", largerTableOutput)
end
Example (input has 2 partitions):
julia> s = Arrow.Stream("test.arrow")
Arrow.Stream(Arrow.ArrowBlob[Arrow.ArrowBlob(UInt8[0xff, 0xff, 0xff, 0xff, 0xa8, 0x00, 0x00, 0x00, 0x10, 0x00 … 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00], 1, 600)], 1, nothing, Symbol[], Type[], nothing, Dict{Int64, Arrow.DictEncoding}(), Dict{Int64, Arrow.Flatbuf.Field}(), true, Base.RefValue{Union{Nothing, Symbol}}(nothing))
julia> collect(s)
2-element Vector{Any}:
Arrow.Table with 1 rows, 2 columns, and schema:
:a Int64
:b Int64
Arrow.Table with 1 rows, 2 columns, and schema:
:a Int64
:b Int64
julia> for x in s
df = DataFrame(x)
@show df
Arrow.append("test2.arrow", df)
end
df = 1×2 DataFrame
Row │ a b
│ Int64 Int64
─────┼──────────────
1 │ 1 2
df = 1×2 DataFrame
Row │ a b
│ Int64 Int64
─────┼──────────────
1 │ 2 3
julia> read("test.arrow") == read("test2.arrow")
true