I refactored the code somewhat, using the raw version of the circular as in @rocco_sprmnt21's proposal, and putting dedicated code into functions for
- extracting the latest GCN circular number
- extracting the GRB, if present
- extracting the table, if present,

plus some corrections for special cases encountered while testing (see comments inside the functions).
@time using HTTP,DataFrames,CSV
"""
    getlatestcirc()

Return, as an `Int`, the circular number read from the GCN circulars index page.

The index page has no raw/plain-text version, so the number is taken from the first
`<li value="N">` element found in the HTML (the page is assumed to list the newest
circular first -- TODO confirm if the page layout changes).

# Throws
- whatever `HTTP.get` throws when the index page cannot be fetched;
- `ErrorException` when no `<li value="N">` element is present in the page.
"""
function getlatestcirc()
    urln = "https://gcn.nasa.gov/circulars/" # no raw version => we will work with html
    resp = HTTP.get(urln) # throws if the page cannot be fetched
    body = String(resp.body)
    # Match the whole element in one go: the number may have any number of digits,
    # and a missing element yields a clear error instead of a MethodError on `nothing`.
    m = match(r"<li value=\"(\d+)\">", body)
    if isnothing(m)
        error("getlatestcirc: no <li value=\"N\"> element found in $urln")
    end
    return parse(Int, m.captures[1])
end
"""
    getgrb(lines, gcn)

Scan `lines` for a GRB designation and return the tuple `(grb, grbt)`.

For every line containing `"GRB"`, the designation is what remains after dropping
everything up to the last `"GRB"`, one leading blank, one leading colon, and then
everything from the first blank, `|` or `:` onwards.

# Returns
- `grb`: the first designation found, or `nothing` when no line contains `"GRB"`;
- `grbt`: the same designation as text, or `"no_grb"` when none was found.

# Throws
- `ErrorException` when a later designation differs from the first one and the first
  one does not start with the later one truncated at its first `.`; `gcn` is only
  used in that error message.
"""
function getgrb(lines, gcn)
    found = nothing
    label = "no_grb"
    # Clean-up rules, applied in this exact order to each candidate line
    cleanup = (r".*GRB" => "", r"^ " => "", r"^:" => "", r"[ |:].*$" => "")
    for line in lines
        occursin("GRB", line) || continue
        candidate = foldl((s, rule) -> replace(s, rule), cleanup; init=line)
        if isnothing(found)
            # first designation seen: remember it
            found = candidate
            label = found
            continue
        end
        found == candidate && continue
        # tolerate variants of the same burst that only add a ".xxx" suffix
        stem = replace(candidate, r"\..*$" => "")
        if !startswith(found, stem)
            error("Error, multiple GRB ($found,$candidate) in circ $gcn")
        end
    end
    return found, label
end
"""
    getdf(lines, gcn)

Extract the `|`-delimited table whose header line starts with `Tmid-T0` from `lines`
and return it as a `DataFrame`.

The table ends on the line before the first line starting with `Filter`, or before the
first empty line found more than 3 lines after the header, or at the last line of
`lines` when neither terminator is present. `lines` is not modified.

# Arguments
- `lines`: the circular text, one element per line.
- `gcn`: the circular number; used in error messages and for per-circular fix-ups.

# Throws
- `ErrorException` when no header line is found, when a second `Tmid-T0` header is
  found, or when a `Filter` line is met after the table end was already determined.
"""
function getdf(lines, gcn)
    dbg = false
    lfirsttable = nothing
    llasttable = nothing
    # gcn 33228 table starts by Sources | Tmid-T0 (day) | UT (start) ...
    # gcn 33228 table has blank lines inside table, table is terminated by "----"
    for (i, line) in pairs(lines)
        if startswith(line, "Tmid-T0")
            isnothing(lfirsttable) || error("Error, multiple tables (Tmid-T0)in circ $gcn")
            lfirsttable = i
        end
        isnothing(lfirsttable) && continue # nothing to terminate before the header
        # gcn 33064 is terminated by a blank line, not by a Filter line
        if isnothing(llasttable) && i > lfirsttable + 3 && line == ""
            llasttable = i - 1 # premature end of table (no Filter line)
            dbg && println("setting llasttable (blk) ", llasttable)
        end
        if startswith(line, "Filter")
            isnothing(llasttable) || error("Error, multiple tables (Filter)in circ $gcn")
            llasttable = i - 1
            dbg && println("setting llasttable (Filter) ", llasttable)
        end
    end
    isnothing(lfirsttable) && error("Error, no table (Tmid-T0) in circ $gcn")
    if isnothing(llasttable)
        # no terminator at all: the table runs to the end of the text
        llasttable = lastindex(lines)
    end
    # normalize names in header line (on a copy, so the caller's lines stay untouched)
    header = replace(lines[lfirsttable], "." => "")
    cltxt = join([header; lines[lfirsttable+1:llasttable]], '\n')
    if gcn == 33064
        # this circular has a stray delimiter after one value
        cltxt = replace(cltxt, "15.3 |" => "15.3")
    end
    # A literal " P| " (seen in gcn 34262) would be split as a delimiter:
    # mask it before parsing and restore it afterwards.
    haspvert = occursin(" P| ", cltxt)
    if haspvert
        dbg && println("gcn $gcn haspvert")
        cltxt = replace(cltxt, " P| " => " P! ")
    end
    dbg && println("\n cltxt=\n", cltxt, "\n endofcltxt")
    # skipto=3: line 1 is the header, line 2 is skipped, data starts at line 3
    df = CSV.read(IOBuffer(cltxt), DataFrame, delim='|', skipto=3, normalizenames=true)
    dbg && @show df
    if haspvert
        dbg && println("rechange")
        df.Filt = replace.(df.Filt, " P! " => " P| ")
        dbg && @show df
    end
    return df
end
"""
    doanalysis(; circfirst=33037, circlast=nothing, writedetailedcsv=false)

Download the raw text of every GCN circular from `circfirst` to `circlast`, extract the
`Tmid-T0` table of each circular that mentions "elescope" and has such a table, and
return all tables concatenated in one `DataFrame` with added `GCN` and `GRB` columns.

The combined table is also written to `data-all.csv`. Progress is printed to stdout.

# Keywords
- `circfirst`: first circular number to inspect.
- `circlast`: last circular number to inspect; when `nothing`, it is obtained from
  `getlatestcirc()`.
- `writedetailedcsv`: when `true`, also write each table to `data-<gcn>.csv`.

# Returns
The combined `DataFrame`, or `nothing` when no circular yielded a table.

# Throws
Any error raised while processing a circular is reported with its number and rethrown.
"""
function doanalysis(; circfirst::Integer=33037, circlast::Union{Integer,Nothing}=nothing,
                    writedetailedcsv::Bool=false)
    clearline = "\r" * (" "^60) * "\r"
    dfg = nothing
    if isnothing(circlast)
        circlast = getlatestcirc()
        println("lastcirc is ", circlast)
    end
    println("Will peek from $circfirst to $circlast")
    for gcn in circfirst:circlast
        print(clearline)
        print("\r peeking at gcn: $gcn ")
        try
            # we use the "raw" version of circ bulletin, because it is plain text and not html
            url = "https://gcn.nasa.gov/circulars/$gcn/raw"
            # with "status_exception=false", an HTTP error status does not raise an exception
            resp = HTTP.get(url; status_exception=false)
            status = resp.status
            print(" ", status, " ")
            if status == 404
                println("page not found")
                continue
            elseif !(200 <= status < 300)
                # the body of an error page is not a circular: do not scan it
                println("unexpected HTTP status, skipping")
                continue
            end
            txt = String(resp.body)
            hastelescope = occursin("elescope", txt)
            hasGRB = occursin("GRB", txt)
            hasrobotic = occursin("robotic telescope", txt)
            # only a header at the start of a line counts: some circulars have
            # "Tmid-T0" in the middle of a line, with no Filter end line, etc.
            hastmidt0 = occursin("\nTmid-T0", txt)
            ok = hastelescope && hastmidt0
            if hastelescope || hasGRB || hasrobotic || hastmidt0
                cmt = ""
                if hasrobotic; cmt = cmt * " robotic"; end
                if hastelescope; cmt = cmt * " telescope"; end
                if hasGRB; cmt = cmt * " GRB"; end
                if hastmidt0; cmt = cmt * " tmidt0"; end
                if ok; cmt = cmt * " OK"; end
                cmt = cmt * " "
                print(" :", cmt)
            end
            if hastmidt0 && occursin("Tmid-T0 (day)", txt)
                println("skipping bad header Tmid-T0 (day)")
                continue
            end
            if ok
                lines = readlines(IOBuffer(txt))
                grb, grbt = getgrb(lines, gcn)
                if isnothing(grb) && hasGRB
                    println(" !no grb with hasGRB !")
                    error("\ncirc $gcn : no grb when hasGRB!!!")
                end
                println("grb:", grbt)
                df = getdf(lines, gcn)
                # tag every row with the circular it comes from
                df.GCN = fill(gcn, nrow(df))
                df.GRB = fill(grbt, nrow(df))
                if isnothing(dfg)
                    dfg = df
                else
                    # circulars do not all have the same columns
                    dfg = vcat(dfg, df, cols=:union)
                end
                if writedetailedcsv
                    CSV.write("data-$(gcn).csv", df)
                end
            end
        catch e
            println("error caught at circ $gcn ", e)
            rethrow()
        end # trycatch
    end # for loop
    println()
    if !isnothing(dfg)
        namedfg = "data-all.csv"
        CSV.write(namedfg, dfg)
        @info "$namedfg written successfully"
        println("\ndescription"); println(describe(dfg))
        println("\ncontent"); show(dfg)
        println()
    else
        @info "no dfg to write"
    end # !isnothing
    return dfg
end # function doanalysis
# Script entry point: run the analysis and keep its result (the combined DataFrame,
# or `nothing` when no table was found) in the global `dfg`.
dfg=doanalysis()
println("tkumar done.")
Hope it helps.