The Prospero Challenge

I just found the “Prospero Challenge” on hackernews and thought it’s a really nice toy example to write a parser and to analyze compile times. Maybe it’s even possible to compile the entire document, the parent repo also looks interesting. I currently don’t have time to give it a try, but if one of you gets nerd sniped, I’d love to see it to learn a little more :slight_smile:

5 Likes

I know this isn’t the point, but I made a straightforward translation of prospero.vm to Julia syntax

# Text of a monologue from The Tempest
function prospero(x,y)
	_0 = 2.95
	_1 = x
	_2 = 8.13008
...
	_1eb7 = min(_1eaf, _1eb6)
	_1eb8 = max(_1ea9,_1eb7)
	_1eb9 = min(_1ea3, _1eb8)
end

and got these timings with my old version of Julia on my old computer.

julia> @time @eval prospero(0.5,0.5)
  2.622024 seconds (1.92 M allocations: 86.854 MiB, 0.54% gc time, 99.75% compilation time)
-0.028191772642285284



julia> @benchmark prospero($rand(), $rand())
BenchmarkTools.Trial: 10000 samples with 8 evaluations.
 Range (min … max):  2.788 μs …  23.913 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     3.225 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.286 μs ± 530.315 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

        ▂▃▇█▇▁▁
  ▂▂▂▃▄▇███████▆▄▃▂▂▂▂▂▂▂▁▂▂▂▂▁▂▁▁▁▁▂▁▁▁▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂ ▃
  2.79 μs         Histogram: frequency by time        5.55 μs <

 Memory estimate: 0 bytes, allocs estimate: 0.



julia> @time [prospero(x,y) for x in range(-1,1,length=1024) for y in range(-1,1,length=1024)];
  3.065433 seconds (100.15 k allocations: 16.418 MiB, 0.32% gc time, 1.70% compilation time)

You are benchmarking a loop in global scope. It should only involve a single allocation if you do it right, no?

1 Like

Do we have the right to use F16 ? 12GB is better

function process_one_op!(v,op,name,args)
    if haskey(v,name)
        broadcast!(op,v[name],v[args[1]],v[args[2]])
    else
        v[name] = broadcast(op,v[args[1]],v[args[2]])
    end
    return nothing
end
function process_single_op!(v,op,name,args)
    if haskey(v,name)
        broadcast!(op,v[name],v[args[1]])
    else
        v[name] = broadcast(op,v[args[1]])
    end
    return nothing
end
function copytoifexists!(v,name,x)
    if haskey(v,name)
        copyto!(v[name],x)
    else
        v[name] = x
    end
    return nothing
end
function const_change!(v,name,c)
    if haskey(v,name)
        v[name] .= c
    else
        v[name] = [c;;]
    end
    return nothing
end
function process_op!(v,op,name,args,T,x,y)
    if op == "var-x"
        copytoifexists!(v,name,x)
    elseif op == "var-y"
        copytoifexists!(v,name,y)
    elseif op == "const"
        const_change!(v,name,parse(T, args[1]))
    elseif op == "add"
        process_one_op!(v,+,name,args) # v[args[1]] .+ v[args[2]]
    elseif op == "sub"
        process_one_op!(v,-,name,args) #v[args[1]] .- v[args[2]]
    elseif op == "mul"
        process_one_op!(v,*,name,args) #v[args[1]] .* v[args[2]]
    elseif op == "max"
        process_one_op!(v,max,name,args) #max.(v[args[1]], v[args[2]])
    elseif op == "min"
        process_one_op!(v,min,name,args) #min.(v[args[1]], v[args[2]])
    elseif op == "neg"
        process_single_op!(v,-,name,args) #-v[args[1]]
    elseif op == "square"
        process_single_op!(v,x->x^2,name,args) #v[args[1]].^2
    elseif op == "sqrt"
        process_single_op!(v,sqrt,name,args) #sqrt.(v[args[1]])
    else
        error("unknown opcode '$op'")
    end
    return nothing
end

function main(file)
    strip_file = strip(read(file, String))
    T = Float16
    image_size = 1024
    space = range(-1.0f0, 1.0f0, image_size)
    x = T[space[j] for i in 1:image_size, j in 1:image_size]
    y = T[-space[i] for i in 1:image_size, j in 1:image_size]
    v = Dict{String, Matrix{T}}()
    final_key = ""
    @fastmath for line in split(strip_file, "\n")
        if startswith(line, "#")
            continue
        end
        tokens = split(line)
        name = tokens[1]  
        op = tokens[2]   
        @views args = tokens[3:end]
        final_key = name  
        process_op!(v,op,name,args,T,x,y)
    end
    out = v[final_key]
    img = ifelse.(out .< 0, UInt8(255), UInt8(0))
    img = permutedims(img)
    open("out.ppm", "w") do f
        write(f, "P5\n$(image_size) $(image_size)\n255\n")
        write(f, vec(img))
    end
    return nothing
end
GC.gc()
@time main("prospero.vm")

I get around 8s but its really dependent on the machine I think. Sorry for the naming its a mess :')
GPU translation is easier than ever too (just load x and y on the gpu, make the Dict accept CuMatrix and collect before writing)

using CUDA

@fastmath function process_one_op!(v,op,name,args)
    if haskey(v,name)
        broadcast!(op,v[name],v[args[1]],v[args[2]])
    else
        v[name] = broadcast(op,v[args[1]],v[args[2]])
    end
    return nothing
end
@fastmath function process_single_op!(v,op,name,args)
    if haskey(v,name)
        broadcast!(op,v[name],v[args[1]])
    else
        v[name] = broadcast(op,v[args[1]])
    end
    return nothing
end
@fastmath function copytoifexists!(v,name,x)
    if haskey(v,name)
        copyto!(v[name],x)
    else
        v[name] = x
    end
    return nothing
end
@fastmath function const_change!(v,name,c)
    if haskey(v,name)
        v[name] .= c
    else
        v[name] = cu([c;;])
    end
    return nothing
end
@fastmath  function process_op!(v,op,name,args,T,x,y)
    if op == "var-x"
        copytoifexists!(v,name,x)
    elseif op == "var-y"
        copytoifexists!(v,name,y)
    elseif op == "const"
        const_change!(v,name,parse(T, args[1]))
    elseif op == "add"
        process_one_op!(v,+,name,args) # v[args[1]] .+ v[args[2]]
    elseif op == "sub"
        process_one_op!(v,-,name,args) #v[args[1]] .- v[args[2]]
    elseif op == "mul"
        process_one_op!(v,*,name,args) #v[args[1]] .* v[args[2]]
    elseif op == "max"
        process_one_op!(v,max,name,args) #max.(v[args[1]], v[args[2]])
    elseif op == "min"
        process_one_op!(v,min,name,args) #min.(v[args[1]], v[args[2]])
    elseif op == "neg"
        process_single_op!(v,-,name,args) #-v[args[1]]
    elseif op == "square"
        process_single_op!(v,x->x^2,name,args) #v[args[1]].^2
    elseif op == "sqrt"
        process_single_op!(v,sqrt,name,args) #sqrt.(v[args[1]])
    else
        error("unknown opcode '$op'")
    end
    return nothing
end

@fastmath  function main(file)
    strip_file = strip(read(file, String))
    T = Float16
    image_size = 1024
    space = range(-1.0f0, 1.0f0, image_size)
    x = T[space[j] for i in 1:image_size, j in 1:image_size] |> cu
    y = T[-space[i] for i in 1:image_size, j in 1:image_size] |> cu
    v = Dict{String, CuMatrix{T}}()
    final_key = ""
    @fastmath for line in split(strip_file, "\n")
        if startswith(line, "#")
            continue
        end
        tokens = split(line)
        name = tokens[1]  
        op = tokens[2]   
        @views args = tokens[3:end]
        final_key = name  
        process_op!(v,op,name,args,T,x,y)
    end
    out = v[final_key]
    img = ifelse.(out .< 0, UInt8(255), UInt8(0)) |> collect
    img = permutedims(img)
    open("out.ppm", "w") do f
        write(f, "P5\n$(image_size) $(image_size)\n255\n")
        write(f, vec(img))
    end
    return nothing
end
GC.gc()
CUDA.@time main("prospero.vm")

and this gives 1.1s (again for me)

  1.120287 seconds (873.03 k CPU allocations: 34.781 MiB, 2.62% gc time) (7.87 k GPU allocations: 12.618 GiB, 1.34% memmgmt time)

It also leads to the right image obviously.