Type inference slowdown in binary file i/o


#1

I’m writing code to read and write 1D arrays to binary files. I want the read function to infer the element type from the file and read back the data. Initially I had a large slowdown, which I partially resolved by splitting the read function into two. However, compared with a fully typed function, I am still seeing a difference in read times that depends on the length of the array being read back.

I would appreciate an explanation and a suggestion of how to fix the issue.

This is the write function:

"""
    write_bin(x::Array{T, 1}, fileName::String) -> Int64

Write the vector `x` to the binary file `fileName`, replacing any existing content.

The file layout is, in order:

1. the number of elements of `x`, as an `Int64`;
2. the number of characters in `repr(T)`, as an `Int64`;
3. the characters of `repr(T)`;
4. the elements of `x`.

Always returns `0`.
"""
function write_bin(x::Array{T, 1}, fileName::String)::Int64 where T
  # The do-block form of `open` closes the file even if one of the writes throws.
  open(fileName, "w") do io
    # Cast the length so the header field always has the same, known type
    write(io, Int64(length(x)))

    # Type-name header: the character count first, then the characters themselves
    typ = repr(T)
    write(io, Int64(length(typ)))
    write(io, typ)

    # Write the whole array in one call instead of one `write` per element
    write(io, x)
  end

  return 0
end

This is the inferred type read function:

"""
    read_bin(io::IO, ::Type{T}, n::Int64) where T

Read `n` values of type `T` from the current position of `io` into a new
`Vector{T}`, close `io`, and return the vector.

This method is the function barrier for the type-inferring reader: the caller may
only learn `T` at run time, but because `T` arrives here as a `Type{T}` argument
the read below is compiled for the concrete element type.

The time taken by the read is printed with `@time`.
"""
function read_bin(io::IO, ::Type{T}, n::Int64) where T
  # The array to be returned (`undef` replaces the pre-1.0 name `uninitialized`)
  x = Vector{T}(undef, n)

  try
    # Fill the whole array in one call instead of one `read` per element
    @time read!(io, x)
  finally
    # This method takes ownership of `io`; close it even if the read fails
    close(io)
  end

  return x
end

"""
    read_bin(fileName::String)

Read back a vector stored in `fileName`, taking the element type from the type
name recorded in the file header.

The header is read as: element count (`Int64`), type-name length in characters
(`Int64`), then that many characters. The type name is resolved with `eval`, and
the payload is read by `read_bin(io, T, n)`.

Because the element type is only known at run time, the return type of this
method cannot be inferred by the compiler.
"""
function read_bin(fileName::String)
  # The do-block form of `open` closes the file even if parsing the header throws.
  return open(fileName, "r") do io
    # Total number of elements in the resulting array
    n = read(io, Int64)
    # Length of the type name, in characters
    nt = read(io, Int64)

    # Then read the type name itself
    typeName = join(read(io, Char) for _ in 1:nt)

    # WARNING(review): this evaluates a name taken from the file, so only use it
    # on trusted files. It resolves plain names such as "Float64"; a parametric
    # name such as "Array{Float64,1}" is turned into a single Symbol and will
    # not resolve to a type.
    T = eval(Symbol(typeName))

    # Function barrier: `T` is a run-time value here, so hand the payload read to
    # the method that is compiled for the concrete element type.
    read_bin(io, T, n)
  end
end

and the explicitly typed read function:

"""
    read_bin(fileName::String, ::Type{T}) where T

Read back a vector stored in `fileName`, using the caller-supplied element type
`T` rather than the type name recorded in the file.

The header is read as: element count (`Int64`), type-name length in characters
(`Int64`), then that many characters. The stored type name is skipped and is not
compared with `T`.

The time taken to read the payload is printed with `@time`.
"""
function read_bin(fileName::String, ::Type{T}) where T
  # The do-block form of `open` closes the file even if a read throws.
  return open(fileName, "r") do io
    # Total number of elements in the resulting array
    n = read(io, Int64)
    # Length of the type name, in characters
    nt = read(io, Int64)

    # Skip over the stored type name; the element type comes from the caller
    for _ in 1:nt
      read(io, Char)
    end

    # The array to be returned (`undef` replaces the pre-1.0 name `uninitialized`)
    x = Vector{T}(undef, n)

    # Fill the whole array in one call instead of one `read` per element
    @time read!(io, x)

    x
  end
end

This is the benchmark (it generates an 800 MB file):

# Warm up: run every function once on a tiny array so that the timed run
# below is not dominated by first-call compilation.
binFile = "data.bin"

n = 100;
arr1 = rand(Float64, n);
write_bin(arr1, binFile);
# Reader that takes the element type from the file header
arr2 = read_bin(binFile);
# Reader that is given the element type explicitly
arr3 = read_bin(binFile, eltype(arr1));

# Remove the warm-up file before the real run
rm(binFile)

# Timed write/read: 100_000_000 Float64 values, i.e. about 800 MB of data.
# The timings are printed by the `@time` inside each reader.
n = 100_000_000;
arr1 = rand(Float64, n);
write_bin(arr1, binFile);
# Reader that takes the element type from the file header
arr2 = read_bin(binFile);
# Reader that is given the element type explicitly
arr3 = read_bin(binFile, eltype(arr1));

# Clean up the benchmark file
rm(binFile)

#2

Okay, I think I see the issue. I can just create an object with a type parameter for binary files and call typed functions on it, so the element type is already resolved from the object’s type.


#4

For very large data sets (I tried with 8 GB), all these methods are just as efficient as each other. I feared that the problem would get worse for very large data sets, but it doesn’t.