Hi,
Many thanks for the response, Dan. I came to the same conclusion and altered it as follows:
function writewithschema(io, parts, rows, st, sch, dictrow, compress, kw)
    comp = get(COMPRESSORS, compress, nothing)
    schtyp = schematype(sch)
    meta = Dict("avro.schema" => JSON3.write(schtyp))
    if comp !== nothing
        meta["avro.codec"] = String(compress)
    end
    sync = _cast(NTuple{16, UInt8}, rand(UInt128))
    buf = write((magic=MAGIC, meta=meta, sync=sync); schema=FileHeaderRecordType)
    Base.write(io, buf)
    @debug 1 "wrote file header from bytes 1:$(pos - 1)"
    i = 1
    while true
        # if rows didn't have schema or length, we materialized w/ Tables.dictrowtable
        nrow = length(rows)
        @debug 1 "writing block count ($nrow) at pos = $pos"
        rowsstate = iterate(rows)
        pos = 1
        if rowsstate === nothing
            bytes = UInt8[]
            pos = 0
        else
            row, rowst = rowsstate
            # calc nbytes on all rows to find max, then allocate bytes
            bytesperrow = nbytes(schtyp, row)
            while true
                rowsstate = iterate(rows, rowst)
                rowsstate === nothing && break
                row, rowst = rowsstate
                nb = nbytes(schtyp, row)
                if nb > bytesperrow
                    bytesperrow = nb
                end
            end
            rowsstate = iterate(rows)
            row, rowst = rowsstate
            blen = trunc(Int, nrow * bytesperrow * 1.05) # add 5% cushion
            bytes = Vector{UInt8}(undef, blen)
            n = 1
            nb = nbytes(schtyp, row)
            while true
                pos = writevalue(Binary(), schtyp, row, bytes, pos, blen, kw)
                rowsstate = iterate(rows, rowst)
                rowsstate === nothing && break
                row, rowst = rowsstate
                nb = nbytes(schtyp, row)
                bytesperrow += nb
                n += 1
            end
        end
        # compress
        if comp !== nothing
            finalbytes = transcode(comp[Threads.threadid()], unsafe_wrap(Base.Array, pointer(bytes), pos - 1))
        else
            finalbytes = bytes
        end
        block = Block(nrow, view(finalbytes, 1:length(finalbytes)), sync)
        buf = write(block; schema=BlockType)
        Base.write(io, buf)
        state = iterate(parts, st)
        state === nothing && break
        part, st = state
        rows = Tables.rows(part)
        sch = Tables.schema(rows)
        if dictrow
            rows = Tables.dictrowtable(rows)
        end
    end
    return
end
Instead of assessing only the first row, the entire table is scanned to establish the maximum row size, since that guarantees the bytes vector is large enough for downstream processing. The allocation is definitely bigger than necessary and there is a performance cost, but it will work (memory permitting). I could have summed the per-row byte counts and sized the vector from that instead, but it's not clear to me whether that would work in every case.
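For what it's worth, here is a minimal, self-contained sketch of the two sizing strategies; it is not Avro.jl code, and `rowsize` plus the example `rows` are purely illustrative stand-ins for `nbytes(schtyp, row)` and a real table:

# Stand-in for Avro's nbytes(schtyp, row); purely illustrative.
rowsize(row) = sizeof(row.name) + 8

rows = [(name = "a",), (name = "bbbb",), (name = "cc",)]
nrow = length(rows)

# Strategy in the patch above: worst-case row size times row count, plus a 5% cushion.
maxsize  = maximum(rowsize, rows)
blen_max = trunc(Int, nrow * maxsize * 1.05)

# Alternative mentioned above: sum the exact per-row sizes.
blen_sum = sum(rowsize, rows)

@show blen_max blen_sum  # blen_max >= blen_sum: over-allocates, but never under-allocates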
To your point, there isn't an easy way to address this without altering this method or forcing the sort by size.
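For completeness, the sort workaround would look roughly like the sketch below (reusing the same hypothetical `rowsize` and `rows` as above, not Avro.jl code): ordering so the widest row comes first makes the original first-row-based sizing an upper bound.

# Hypothetical pre-sort so the first row is the largest; with this ordering,
# sizing the buffer from the first row (as the unmodified method does) is safe.
rowsize(row) = sizeof(row.name) + 8
rows = [(name = "a",), (name = "bbbb",), (name = "cc",)]

sorted_rows = sort(collect(rows); by = rowsize, rev = true)
@show rowsize(first(sorted_rows))  # now the maximum row size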
Edit: here is the updated issue raised in Avro.jl: Problem writing string columns to avro file · Issue #17 · JuliaData/Avro.jl (github.com), and the PR I created: Updated writewithschema in tables.jl by djholiver · Pull Request #20 · JuliaData/Avro.jl (github.com).
Regards,