Announcing: package download stats!

As much as I agree with all of these points, I think this is where the community can hopefully step in, since I don't really have time to do all of that.

Here are some details of how the processing works. At the core of this is the logs table, which we upload new log data into regularly. Here is the schema of that table:

 Row │ name                type
     │ String              String
─────┼───────────────────────────────────────
   1 │ row_id              NUMBER(38,0)
   2 │ time_utc            TIMESTAMP_NTZ(9)
   3 │ request             VARCHAR(16777216)
   4 │ request_method      VARCHAR(16777216)
   5 │ request_url         VARCHAR(16777216)
   6 │ package             VARCHAR(16777216)
   7 │ package_uuid        VARCHAR(16777216)
   8 │ package_hash        VARCHAR(16777216)
   9 │ registry            VARCHAR(16777216)
  10 │ registry_uuid       VARCHAR(16777216)
  11 │ registry_hash       VARCHAR(16777216)
  12 │ artifact            VARCHAR(16777216)
  13 │ artifact_hash       VARCHAR(16777216)
  14 │ request_http        VARCHAR(16777216)
  15 │ status              NUMBER(38,0)
  16 │ body_bytes_sent     NUMBER(38,0)
  17 │ http_user_agent     VARCHAR(16777216)
  18 │ request_time        FLOAT
  19 │ julia_version       VARCHAR(16777216)
  20 │ julia_system        VARCHAR(16777216)
  21 │ julia_ci_variables  VARCHAR(16777216)
  22 │ julia_interactive   BOOLEAN
  23 │ julia_pkg_server    VARCHAR(16777216)
  24 │ request_id          VARCHAR(16777216)
  25 │ cache_miss          VARCHAR(16777216)
  26 │ pkgserver           VARCHAR(16777216)
  27 │ hll_hash            BINARY(8388608)
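
To make the schema a bit more concrete, here is what a single log row might look like if pulled into Julia as a named tuple. Every value below is invented purely for illustration, and only some of the fields are shown:

# A hypothetical log row with made-up values (only some fields shown),
# just to give a feel for what lands in the logs table.
row = (
    time_utc           = "2021-02-01 12:34:56.000",
    request_method     = "GET",
    request_url        = "/registries",
    status             = 200,
    body_bytes_sent    = 591,
    request_time       = 0.004,
    julia_version      = "1.5.3",
    julia_system       = "x86_64-linux-gnu",
    julia_ci_variables = "CI=n;GITHUB_ACTIONS=n",
    julia_interactive  = true,
    pkgserver          = "us-east.pkg.julialang.org",
)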

The following view definition is used to add some classification logic to the logs:

create or replace view logs_classified as
select *,
-- requester attributes
  case
    when "request_method" not in ('GET', 'HEAD') then null
    when "julia_ci_variables" regexp '.*=[^n].*' then 'ci'
    when "julia_ci_variables" is not null        then 'user'
                                                 else null
  end as "client_type",
  regexp_substr("julia_version", '^[0-9]+\.[0-9]+\.[0-9]+(-[^.]+)?') as "julia_version_prefix",
  regexp_substr("pkgserver", '^[a-z-]+') as "region",
-- requested attributes
  case
    when "request_method" not in ('GET', 'HEAD') then null
    when "request_url" regexp '/meta(|/.*)'      then 'meta'
    when "request_url" = '/registries'           then 'registries'
    when "registry" is not null                  then 'registry'
    when "package"  is not null                  then 'package'
    when "artifact" is not null                  then 'artifact'
                                                 else null
  end as "resource_type",
-- status classification
  200 <= "status" and "status" < 300 as "status_success",
-- time attributes
  date("time_utc") as "date"
from logs;
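
To spell out what that classification does, here is a rough Julia transcription of the same case logic for a single log row. This is purely illustrative, not code that actually runs anywhere; field names follow the schema above and SQL NULL is represented as nothing:

# Rough Julia transcription of the classification logic in logs_classified
# (illustrative only; the real logic is the SQL view above).
function classify(row)
    get_or_head = row.request_method in ("GET", "HEAD")
    # requester attributes: mirrors the client_type case expression
    client_type = if !get_or_head
        nothing
    elseif occursin(r"=[^n]", something(row.julia_ci_variables, ""))
        "ci"        # mirrors the SQL regexp '.*=[^n].*'
    elseif row.julia_ci_variables !== nothing
        "user"
    else
        nothing
    end
    # requested attributes: mirrors the resource_type case expression
    resource_type = if !get_or_head
        nothing
    elseif occursin(r"^/meta(|/.*)$", something(row.request_url, ""))
        "meta"
    elseif row.request_url == "/registries"
        "registries"
    elseif row.registry !== nothing
        "registry"
    elseif row.package !== nothing
        "package"
    elseif row.artifact !== nothing
        "artifact"
    else
        nothing
    end
    # status classification
    status_success = 200 <= row.status < 300
    return (; client_type, resource_type, status_success)
end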

The following Julia code generates all of the rollups as views, which are in turn used to produce the files that get uploaded:

function rollup_sql(name::AbstractString, fields::AbstractVector{<:AbstractString})
    # record this view's name in the global rollup_names list (defined below)
    push!(rollup_names, name)
    # repr quotes each field name, yielding double-quoted SQL identifiers
    keys_sql = join(map(repr, fields), ", ")
    vals_sql = """
      , hll_estimate(hll_combine("hll_hash")) as "request_addrs"
      , count(*) as "request_count"
    """
    # skip the successes column when the rollup already groups by status
    if !("status" in fields)
        vals_sql *= """
          , sum(iff("status_success", 1, 0)) as "successes"
        """
    end
    vals_sql *= """
      , sum(iff("cache_miss" = 'miss', 1, 0)) as "cache_misses"
      , sum("body_bytes_sent") as "body_bytes_sent"
      , sum("request_time") as "request_time"
    """
    # per-date statistics are redundant when already grouping by date
    if !("date" in fields)
        vals_sql *= """
          , count(distinct "date") as "date_count"
          , min("date") as "date_min"
          , max("date") as "date_max"
        """
    end
    # per-resource rollups only consider rows requesting that resource type
    where_clauses = String[]
    "package_uuid"  in fields && push!(where_clauses, "\"resource_type\" = 'package'")
    "registry_uuid" in fields && push!(where_clauses, "\"resource_type\" = 'registry'")
    "artifact_hash" in fields && push!(where_clauses, "\"resource_type\" = 'artifact'")
    # by-date rollups are limited to the last year of complete days
    if "date" in fields
        push!(where_clauses, "dateadd(year, -1, current_date()) <= \"date\"")
        push!(where_clauses, "\"date\" < current_date()")
    end
    from_sql = isempty(where_clauses) ?
        "logs_classified" :
        "logs_classified where $(join(where_clauses, "\n  and "))"
    return """
    create or replace view $name as
    select $keys_sql
    $(chomp(vals_sql))
    from $from_sql
    group by $keys_sql
    order by $keys_sql;
    """
end

rollups = [
    "client_types"     => ["client_type"],
    "julia_systems"    => ["julia_system", "client_type"],
    "julia_versions"   => ["julia_version_prefix", "client_type"],
    "resource_types"   => ["resource_type", "status", "client_type"],
    "package_requests" => ["package_uuid", "status", "client_type"],
]

rollup_names = ["log_recency"]
for (name, fields) in rollups
    println(rollup_sql(name, fields))
    println(rollup_sql("$(name)_by_region", [fields; "region"]))
    println(rollup_sql("$(name)_by_date", [fields; "date"]))
    println(rollup_sql("$(name)_by_region_by_date", [fields; "region"; "date"]))
end
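
Each entry in rollups produces four views (the base rollup plus _by_region, _by_date, and _by_region_by_date variants), so the loop defines 20 rollup views in total, and rollup_names ends up listing all of them, seeded with "log_recency" (which is presumably defined elsewhere). A quick sanity check:

# rollup_names now holds "log_recency" plus the 4 × 5 generated view names
@assert length(rollup_names) == 1 + 4 * length(rollups)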

You can run this code yourself to see all the rollup definitions, but here’s the first one for free:

create or replace view client_types as
select "client_type"
  , hll_estimate(hll_combine("hll_hash")) as "request_addrs"
  , count(*) as "request_count"
  , sum(iff("status_success", 1, 0)) as "successes"
  , sum(iff("cache_miss" = 'miss', 1, 0)) as "cache_misses"
  , sum("body_bytes_sent") as "body_bytes_sent"
  , sum("request_time") as "request_time"
  , count(distinct "date") as "date_count"
  , min("date") as "date_min"
  , max("date") as "date_max"
from logs_classified
group by "client_type"
order by "client_type";

Most of the aggregate functions are standard SQL. The hll_combine and hll_estimate functions are Snowflake-specific: hll_combine aggregates a collection of HyperLogLog sketch values (represented as binary blobs) into a single sketch describing the entire collection, and hll_estimate takes such a sketch and produces an estimate of the number of distinct values it represents.
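
If you haven't run into HyperLogLog before, here is a toy Julia sketch of the general idea. This is not Snowflake's sketch format and not what runs in production, just an illustration of what combining and estimating mean: a sketch records, per bucket, the maximum "first one-bit position" seen; combining sketches is an element-wise max; estimation is a scaled harmonic mean, with a linear-counting correction while most buckets are still empty.

# Toy HyperLogLog, purely to illustrate hll_combine/hll_estimate conceptually.
using SHA

const P = 12                # 12 bucket bits, i.e. M = 4096 buckets
const M = 1 << P

# Add one item to a sketch (a vector of per-bucket maximum ranks).
function hll_add!(sketch::Vector{UInt8}, item)
    h = reinterpret(UInt64, sha256(string(item))[1:8])[1]
    bucket = Int(h >> (64 - P)) + 1               # top P bits pick the bucket
    rank   = UInt8(leading_zeros(h << P) + 1)     # first one-bit position in the rest
    sketch[bucket] = max(sketch[bucket], rank)
    return sketch
end

# Combining sketches is just an element-wise maximum.
hll_combine(a::Vector{UInt8}, b::Vector{UInt8}) = max.(a, b)

# Cardinality estimate: scaled harmonic mean of 2^-rank over the buckets,
# with the standard linear-counting correction for small cardinalities.
function hll_estimate(sketch::Vector{UInt8})
    alpha = 0.7213 / (1 + 1.079 / M)
    raw   = alpha * M^2 / sum(2.0 .^ -Float64.(sketch))
    empty = count(==(0x00), sketch)
    return (raw <= 2.5 * M && empty > 0) ? M * log(M / empty) : raw
end

# Usage: per-row sketches combined, then estimated (addresses are made up).
a, b = zeros(UInt8, M), zeros(UInt8, M)
hll_add!(a, "192.0.2.1")
hll_add!(b, "192.0.2.2")
hll_add!(b, "192.0.2.1")
hll_estimate(hll_combine(a, b))   # ≈ 2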

The hll_hash field is the HyperLogLog hash value of the request IP address of the log line, in a binary format that Snowflake understands. This hash is computed from the IP address before the logs are uploaded for storage, and the IP address is then discarded. (We do store a separate copy of the logs that retains the IP address in a different S3 bucket that is set to auto-delete after 30 days; it is only used to diagnose and mitigate attacks and never enters our data analysis systems.) The code to compute the HyperLogLog hash can be found here:

https://github.com/JuliaPackaging/PkgServerLogAnalysis.jl/blob/master/src/hll_hashing.jl

The rest of the code to process and upload package server logs can also be found in that repo. At a high level, the hashing works by encrypting each IP address using a 16-round Feistel cipher with SHA512 as its round function. (This is serious overkill in every way and should definitely be secure; the security only matters for making it hard to intentionally craft an IP address with a specific hash value. The real reason I went to town here was to get a better shot at good statistical properties.) It then uses 30 bits of the encrypted value to generate a 17-bit HyperLogLog hash value: 12 bits select the bucket, and the remaining 18 bits are reduced to an exponential sample value with 19 possible outcomes, which carries log2(19) ≈ 4.25 bits of information and is stored in 5 bits, for 17 bits in total. Using encryption guarantees that every hash value has multiple IPv4 and IPv6 addresses that correspond to it, which would be likely but not guaranteed if we used a hash function instead of encryption.
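
For the curious, here is a simplified Julia sketch of that scheme. The actual implementation is the hll_hashing.jl file linked above; this version invents its own key handling and bit layout and glosses over details like IPv4 vs. IPv6 encoding, so treat it only as an illustration of the structure (a Feistel network with a SHA512 round function, then bucket and sample bits taken from the ciphertext):

using SHA

# One Feistel round: mix a SHA-512-derived value of (key, round, right) into left.
function feistel_round(left::UInt64, right::UInt64, key::Vector{UInt8}, i::Int)
    digest = sha512(vcat(key, UInt8(i), reinterpret(UInt8, [right])))
    f = reinterpret(UInt64, digest[1:8])[1]
    return right, left ⊻ f        # swap halves
end

# Encrypt a 128-bit value (an address encoded as a UInt128) with 16 Feistel rounds.
# A Feistel network is a permutation, so distinct inputs give distinct ciphertexts.
function feistel_encrypt(addr::UInt128, key::Vector{UInt8})
    left  = UInt64(addr >> 64)
    right = UInt64(addr & typemax(UInt64))
    for i in 1:16
        left, right = feistel_round(left, right, key, i)
    end
    return (UInt128(left) << 64) | UInt128(right)
end

# Turn 30 bits of the ciphertext into a HyperLogLog hash: 12 bits select the
# bucket and the leading-zero count of the other 18 bits is the exponential
# sample value (one of 19 possible outcomes).
function hll_hash(addr::UInt128, key::Vector{UInt8})
    enc    = feistel_encrypt(addr, key)
    bits   = UInt32(enc & 0x3fffffff)                        # low 30 ciphertext bits
    bucket = bits >> 18                                       # 12 bucket bits
    sample = min(leading_zeros((bits & 0x3ffff) << 14), 18)   # zeros among the 18 sample bits
    return bucket, sample
end

# e.g. hash a made-up address under a made-up key:
hll_hash(UInt128(0x0a000001), rand(UInt8, 32))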
