Problem of unnecessarily iterating many times in for loop

The following code prints its output many times (about 40 times each) even though there are only three true matches. Unnecessary iteration will decrease performance; as far as I know, it should give only three outputs.

julia> using CSV, DataFrames,HTTP

julia> for x in 34000:34135
               url="https://gcn.nasa.gov/circulars/$x"
               txt=String((HTTP.get(url)))
           if occursin("report on behalf of the Swift/UVOT team",txt)==true || continue
                       url
                       for e in url
                               @show hb,he=findfirst(r"Filter"i,txt)
                       end
               else
               end
       end

(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491


What are you trying to do? What’s your question? This code is doing exactly what you told it to do; in other words, you’re making unnecessary loops.

1 Like

where i am making unnecessary loops ? :thinking:

This is probably not doing what you think it’s doing.

You’re trying to skip the e in URL loop here right?

minor note: the ==true is not necessary as occursin already outputs the boolean value needed for the if statement.

2 Likes

Why are you iterating over every character of your URL, btw?

I am iterating over those URLs because I want to extract data from all the web pages that satisfy the occursin condition, and removing ==true does not make any difference.

URL is just this fixed string

but x is changing every time.

julia> for x in 34000:34135
               url="https://gcn.nasa.gov/circulars/$x"
               txt=String((HTTP.get(url)))
           if occursin("report on behalf of the Swift/UVOT team",txt)
                   @show url
               else
               end
       end
url = "https://gcn.nasa.gov/circulars/34008"
url = "https://gcn.nasa.gov/circulars/34049"
url = "https://gcn.nasa.gov/circulars/34135"

As far as I can tell, you should only need:

# For every circular in the range, fetch the page and, when it is a Swift/UVOT
# report, show where "Filter" (case-insensitive) first occurs in the text.
for x in 34000:34135
    url = "https://gcn.nasa.gov/circulars/$x"
    txt = String(HTTP.get(url))
    # Anything that is not a Swift/UVOT report is skipped outright.
    occursin("report on behalf of the Swift/UVOT team", txt) || continue
    @show hb,he=findfirst(r"Filter"i,txt)
end

If you do

# Iterating a String yields its characters one by one, so with `url` being a
# string this prints a single character per line.
for e in url
   println(e)
end

then you will see what @jling is saying. That loop is just counting over the individual characters in the URL string; it is not iterating over the individual URLs. That is what your first loop is doing.

2 Likes

Try the code given below in a Pluto notebook. I see it giving the correct answer, but over and over again, repeatedly restarting.

using CSV, DataFrames,HTTP
begin
    # Walk the circular numbers and report where "Filter" first appears
    # (case-insensitively) in each page that carries the Swift/UVOT phrase.
    for x in 34000:34135
        url = "https://gcn.nasa.gov/circulars/$x"
        txt = String(HTTP.get(url))
        if !occursin("report on behalf of the Swift/UVOT team", txt)
            continue
        end
        @show hb,he=findfirst(r"Filter"i,txt)
    end
end

Pluto issues?

julia> using HTTP


julia> for x in 34000:34135
               url="https://gcn.nasa.gov/circulars/$x"
               txt=String((HTTP.get(url)))
               if occursin("report on behalf of the Swift/UVOT team",txt)
                   @show hb,he=findfirst(r"Filter"i,txt)
               end
           end
(hb, he) = findfirst(r"Filter"i, txt) = 10545:10550
(hb, he) = findfirst(r"Filter"i, txt) = 10610:10615
(hb, he) = findfirst(r"Filter"i, txt) = 10486:10491
using HTTP,DataFrames,CSV
begin
    for x in 33211:33212
        url = "https://gcn.nasa.gov/circulars/$x"
        txt = String(HTTP.get(url))
        # Only Swift/UVOT reports are parsed; other circulars are skipped.
        occursin("report on behalf of the Swift/UVOT team", txt) || continue

        # Start of the first line that begins with "Filter" (case-insensitive).
        hb, he = findfirst(r"^Filter"im, txt)
        # First newline of the blank line that precedes the next "The" paragraph.
        lr, _ = findnext("\n\nThe", txt, he)

        # Turn runs of two or more spaces before a word character or '>' into a
        # tab, and close up " +/- ", so the slice can be read as tab-delimited.
        cltxt = replace(
            txt[hb:lr],
            " +/- " => "+/-",
            r"  +(\w)" => s"\t\1",
            r"  +(>)" => s"\t>",
        )
        @show df=CSV.read(IOBuffer(cltxt), DataFrame, delim='\t')
    end
end

image

The above code gives correct table but inserting it in below code it produces missing column :upside_down_face:

using HTTP,DataFrames,CSV
"""
    doanalysis()

Fetch the raw text of GCN circulars 33211 to 33212 and, for each one that is a
Swift/UVOT report, read its table (from the first line beginning with "Filter"
up to the blank line before the next paragraph starting with "The") into a
`DataFrame` and show it.

For every circular the progress line, the HTTP status and the GRB name (when
one is found in the text) are printed. Circulars answering with an error status
are reported and skipped. Any other failure is logged as a warning and the loop
moves on to the next circular.

Returns `nothing`.
"""
function doanalysis()
    grbpattern = r"GRB ?\d{6}([A-G]|(\.\d{2}))?"
    for x in 33211:33212
        print("\r peeking at GCN $x ")
        try
            url = "https://gcn.nasa.gov/circulars/$x/raw"
            # Without status_exception=false HTTP.get throws on any status >= 300,
            # so the explicit status check below would never be reached.
            resp = HTTP.get(url; status_exception=false)
            status = resp.status
            print(" ", status, " ")
            if status >= 300
                println("status=", status)
                continue
            end
            txt = String(resp.body)

            # Run the regex once; `match` returns `nothing` when there is no GRB name.
            m = match(grbpattern, txt)
            isnothing(m) || print(m.match)

            occursin("report on behalf of the Swift/UVOT team", txt) || continue

            # Both searches return `nothing` when the marker is absent; skip the
            # circular with a specific message instead of failing on destructuring.
            header = findfirst(r"^Filter"im, txt)
            if isnothing(header)
                @warn "GCN $x: no line starting with \"Filter\" found; skipping"
                continue
            end
            tail = findnext("\n\nThe", txt, last(header))
            if isnothing(tail)
                @warn "GCN $x: no paragraph starting with \"The\" after the table; skipping"
                continue
            end
            # The slice runs from the 'F' of "Filter" to the first newline of the
            # blank line that ends the table.
            hb, lr = first(header), first(tail)

            # NOTE(review): column splitting relies on two or more consecutive
            # spaces between fields; if the /raw body separates fields differently,
            # neighbouring columns will be merged — confirm against the raw text.
            cltxt = replace(txt[hb:lr], " +/- "=>"+/-", r"  +(\w)"=>s"\t\1",r"  +(>)"=>s"\t>")
            @show df=CSV.read(IOBuffer(cltxt), DataFrame, delim='\t')

            # Not yet enabled: tag each row with the circular number and GRB name.
            #df.GCN=[x for i in 1:nrow(df)]
            #df.GRB=[m.match for i in 1:nrow(df)]
            #rename!(df, :Mag => :Magnitude)
            #@show DataFrame(df)
        catch e
            # Let Ctrl-C stop the scan; everything else is logged with its cause
            # so a failing circular does not abort the remaining ones.
            e isa InterruptException && rethrow()
            @warn "GCN $x: request or parsing failed" exception = (e, catch_backtrace())
        end
    end
    return nothing
end
doanalysis()

image

The url = and txt = lines are different between the two codes. What happens if you use the url = and txt = lines from doanalysis in the original (upper) version of the code?

1 Like

What changes should i do in code if i want to keep /raw in URL ?

What happened when you used the raw url in the code that worked?

It contains missing column. See last output above.

I’m not great with web scraping or regex, so I’m not sure how much I can help there. But maybe the first thing to do is grab the txt for both URLs (with and without /raw) and see if the cltxt is the same. It seems like it might not be, in which case you might need to update the replace call.

It looks like your missing Mag data are in the Exp(s) column with some whitespace between the desired Exp(s) and the Mag… You probably accidentally removed or escaped the separating ‘\t’.

1 Like