ArchGDAL - memory issue with vector data

Hello all,

I’m starting to use geo data in Julia with ArcGDAL, after using the R sf package for a while. My use case is simple : I have WGS 84 positions and I want to guess the timezone they belong to.
I have the worldwide map of the timezone from: https://github.com/evansiroky/timezone-boundary-builder/releases/download/2020a/timezones.geojson.zip.

The code works, it delivers the right positions, but memory consumption is very high. It seems proportional to the number of points_without_timezone to tag. I often run out of memory (RAM). I used do-block for ArchGDAL feature functions to reduce the memory footprint, but it seems ineffective.

Any advice to improve it ? Should I release features in some way to avoid memory leaks ? Thx

# SET UP ###################################################################
using Revise, ArchGDAL, HTTP, DataFrames, ZipFile

# Points that need timezones
points_without_timezone = [
    (longitude = 1.834, latitude = 50.136),
    (longitude = 3.0975, latitude = 50.57),
    (longitude = -1.939833, latitude = 49.725167),
    (longitude = -0.456167, latitude = 49.18),
    (longitude = 1.181667, latitude = 49.383),
    (longitude = 4.155333, latitude = 49.209667),
    (longitude = -4.412, latitude = 48.444167),
    (longitude = -3.473167, latitude = 48.825833),
    (longitude = -1.734, latitude = 48.068833),
    (longitude = 0.110167, latitude = 48.4455),
    (longitude = 2.384333, latitude = 48.716833),
    (longitude = 4.02, latitude = 48.324667),
    (longitude = 5.959833, latitude = 48.581),
    (longitude = 7.640333, latitude = 48.5495),
    (longitude = -3.218333, latitude = 47.294333),
    (longitude = -1.608833, latitude = 47.15),
    (longitude = 0.727333, latitude = 47.4445),
    (longitude = 2.359833, latitude = 47.059167),
    (longitude = 5.088333, latitude = 47.267833),
    (longitude = 7.51, latitude = 47.614333),
    (longitude = -1.4115, latitude = 46.046833),
    (longitude = 0.314333, latitude = 46.593833),
    (longitude = 1.175, latitude = 45.861167),
    (longitude = 3.149333, latitude = 45.786833),
    (longitude = 3.764, latitude = 45.0745),
    (longitude = 5.077833, latitude = 45.7265),
    (longitude = -0.691333, latitude = 44.830667),
    (longitude = 1.396667, latitude = 44.745),
    (longitude = 3.0195, latitude = 44.1185),
    (longitude = 4.733, latitude = 44.581167),
    (longitude = 6.502333, latitude = 44.565667),
    (longitude = -0.500167, latitude = 43.909833),
    (longitude = 0.0, latitude = 43.188),
    (longitude = 1.106833, latitude = 43.005333),
    (longitude = 1.378833, latitude = 43.621),
    (longitude = 3.963167, latitude = 43.577),
    (longitude = 5.216, latitude = 43.437667),
    (longitude = 5.940833, latitude = 43.079333),
    (longitude = 7.209, latitude = 43.648833),
    (longitude = 2.872833, latitude = 42.737167),
    (longitude = 8.792667, latitude = 41.918),
    (longitude = 9.485167, latitude = 42.540667),
    (longitude = 47.289667, latitude = -11.582667),
    (longitude = 42.712, latitude = -17.054667),
    (longitude = 40.340667, latitude = -22.344167),
    (longitude = 54.520667, latitude = -15.887667),
    (longitude = 55.528667, latitude = -20.8925),
    (longitude = 77.569167, latitude = -37.795167),
    (longitude = 51.856667, latitude = -46.4325),
    (longitude = 70.243333, latitude = -49.352333),
    (longitude = 45.282833, latitude = -12.8055),
    (longitude = -56.179167, latitude = 46.766333),
    (longitude = -61.004, latitude = 16.335),
    (longitude = -62.852167, latitude = 17.9015),
    (longitude = -61.516333, latitude = 16.264),
    (longitude = -60.875333, latitude = 14.7745),
    (longitude = -60.995667, latitude = 14.595333),
    (longitude = -54.031667, latitude = 5.4855),
    (longitude = -52.365333, latitude = 4.822333),
    (longitude = -51.804667, latitude = 3.890667),
    (longitude = -54.028333, latitude = 3.640167),
    (longitude = 140.001, latitude = -66.663167),
]

# RETRIEVE TIMEZONE ##################################################################
function get_timezone(points = Array{Tuple{Real,Real},1})
    # Define the source to fetch the shapefile as geojson and fetch
    source_geojson = "https://github.com/evansiroky/timezone-boundary-builder/releases/download/2020a/timezones.geojson.zip"
    geojson_content = HTTP.get(source_geojson)
    # In a temp directory with auto closing, unzip
    mktempdir() do path
        # Fetch online
        open(joinpath(path, "tz_map.geojson"), "w") do io
            write(
                io,
                # The first file in the archve contaons the geojson shapefile
                read(ZipFile.Reader(IOBuffer(geojson_content.body)).files[1]),
            )
        end

        # Open vector data archive
        ArchGDAL.read(joinpath(path, "tz_map.geojson")) do dataset
            # Access the first layer
            ArchGDAL.getlayer(dataset, 0) do layer
                # Container for the results (beyond the do-blocks)
                global results_df
                results = []
                # Iterate over the features
                for feat_number = 0:(ArchGDAL.nfeature(layer)-1)
                    # Open feature to access the geom
                    ArchGDAL.getfeature(layer, feat_number) do feature
                        # For a geom (time zone), search if every point belong to it
                        for point in points
                            if ArchGDAL.within(
                                ArchGDAL.createpoint(point[1], point[2]),
                                ArchGDAL.getgeom(feature, 0),
                            )
                                # Record the timezone of the point
                                push!(
                                    results,
                                    (
                                        point[1],
                                        point[2],
                                        ArchGDAL.getfield(feature, "tzid"),
                                    ),
                                )
                            end
                        end
                    end
                end
                # Return in a dataframe, even the points that where time zon is not retrieved
                results_df = DataFrame(results)
                rename!(results_df, ["longitude", "latitude", "timezone"])
                # Merge results into
                results_df = leftjoin(
                    DataFrame(points),
                    results_df,
                    on = ["longitude", "latitude"],
                )
            end
        end
    end
    return results_df
end

# Resuts
points_with_timezone = get_timezone(points_without_timezone)

I am not exactly sure, but could you just move the call to ArchGDAL.getgeom(feature, 0) out of your for loop. Because at the moment you are newly loading the geometry of the timezone for every point. And these geometries are only freed after your do block for the feature.

I would also convert the points into ArchGDAL points before your first for loop because otherwise you have to do this for every timezone anew.

2 Likes

Your first tip drastically reduced the memory requirements ! Thanks

Btw you can also read from the zip file directly (see this blogpost for details):

ArchGDAL.read("/vsizip/timezones.geojson.zip") do dataset
   ...
end

and loop over the features in a layer:

for feature in layer
    ...
end
2 Likes