using Images        # RGB, N0f8, and `load` for the non-WebP formats
using HTTP
# using libwebp_jll # alternative: use the JLL's libwebp instead of a system-wide install
# using ImageView
using CSV
using ProgressMeter
# Fetch a URL with a desktop User-Agent; never throws on a bad status (status_exception = false),
# so callers must check r.status themselves.
function get_html(url)
    r = HTTP.get(url,
        [("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0")];
        connect_timeout = 5,
        connection_limit = 1000,
        readtimeout = 3,
        status_exception = false
    )
    @assert isa(r.body, Vector{UInt8}) "body is not Vector{UInt8}"
    r
end
# Decode a WebP byte stream via libwebp. The C API takes `int*` out-parameters,
# so pass Ref{Cint} rather than Int64 vectors.
function webp_decode(bytes::Vector{UInt8})::Matrix{RGB{N0f8}}
    width, height = Ref{Cint}(0), Ref{Cint}(0)
    ok = ccall((:WebPGetInfo, "libwebp"), Cint, (Ptr{UInt8}, Csize_t, Ref{Cint}, Ref{Cint}), bytes, length(bytes), width, height)
    @assert ok != 0 "WebPGetInfo failed: not a valid WebP stream"
    x = ccall((:WebPDecodeRGB, "libwebp"), Ptr{UInt8}, (Ptr{UInt8}, Csize_t, Ref{Cint}, Ref{Cint}), bytes, length(bytes), width, height)
    w, h = Int(width[]), Int(height[])
    # The buffer is packed row-major RGB; build the h×w image, then free libwebp's buffer.
    img = [RGB(reinterpret(N0f8, unsafe_load(x, 3*(w*i+j)+1)), reinterpret(N0f8, unsafe_load(x, 3*(w*i+j)+2)), reinterpret(N0f8, unsafe_load(x, 3*(w*i+j)+3))) for (i, j) in Iterators.product(0:h-1, 0:w-1)]
    ccall((:WebPFree, "libwebp"), Cvoid, (Ptr{Cvoid},), x)
    img
end
# function open_webp(fpath::String)::Matrix{RGB{N0f8}}
# bytes = read(fpath);
# webp_decode(bytes)
# end
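# Minimal usage sketch, assuming a libwebp shared library is on the loader path and
# "sample.webp" is a hypothetical local file:
# img = webp_decode(read("sample.webp"))
# size(img)  # (height, width)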
# Download one candidate image URL. Returns (content_length, file_type, bytes);
# anything that is not an image response returns (0, "", UInt8[]) and logs the URL to `io`.
function read_webimg(url::AbstractString, io::IO, iolock)
    r = get_html(url)
    headers = Dict(r.headers)
    type = get(headers, "Content-Type", "")
    content_len = haskey(headers, "Content-Length") ? parse(Int64, headers["Content-Length"]) : 0
    # @assert r.status == 200 "r.status != 200 $(url)"
    # @assert startswith(type, "image/") """startswith(type, "image/") is false"""
    if r.status == 200 && startswith(type, "image/")
        type = split(type, "/")[end]   # e.g. "image/jpeg" -> "jpeg"
        bts = r.body
        # img = type == "webp" ? webp_decode(bts) : load(bts |> IOBuffer)
        # @assert size(img)[1] in 200:1600 && size(img)[2] in 200:1600 && max(size(img)...) / min(size(img)...) < 3 "image size out of range $(size(img))"
        # if size(img)[1] in 200:1600 && size(img)[2] in 200:1600 && max(size(img)...) / min(size(img)...) < 3
        return content_len, type, bts
        # end
    end
    # Log skipped/failed URLs; the lock serializes writes from concurrent tasks.
    lock(iolock) do
        println(io, "\t", url)
    end
    0, "", Vector{UInt8}()
end
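# Hypothetical single-URL check:
# len, ext, bts = read_webimg("https://example.com/a.jpg", stdout, ReentrantLock())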
# Resolve a possibly-relative image URL against the page URL: absolute URLs pass through,
# protocol-relative "//..." URLs get "https:", and root-relative paths get the page's scheme + host.
complete(url_main, url) = startswith(url, "http") ? url : (startswith(url, "//") ? "https:" * url : match(r"https?:\/\/.*?(?=\/)", url_main).match * url)
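# Illustrative examples (hypothetical URLs):
# complete("https://example.com/page/1", "/img/a.jpg")              == "https://example.com/img/a.jpg"
# complete("https://example.com/page/1", "//cdn.example.com/b.png") == "https://cdn.example.com/b.png"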
# Process one CSV row: fetch the page, spawn a download task per image URL found in it,
# then save the largest image (by Content-Length) to target_max_img/<i>.<ext>.
function main(i, url)
    buflock = ReentrantLock()
    buf = IOBuffer()                          # per-row log, flushed to `io` at the end
    println(buf, url)
    try
        html = get_html(String(url))
        if html.status == 200
            html = html.body |> String
            # Bare image URLs in the page text ...
            imgs_t = [Threads.@spawn read_webimg(x.match, buf, buflock) for x in eachmatch(r"(https?:[\S]*?\.(jpg|jpeg|png|gif|bmp|bytes)(?=\ ))", html)]
            # ... plus <img src="..."> attributes, resolved against the page URL.
            append!(imgs_t, [Threads.@spawn read_webimg(complete(url, x.captures[1]), buf, buflock) for x in eachmatch(r"""<img [^>]*?src *?= *?"(.*?)(?=")""", html)])
            imgs = map(fetch, imgs_t)
            (sz, index) = length(imgs) > 0 ? findmax(x -> x[1], imgs) : (0, 0)
            if sz > 2000                      # skip rows whose largest image is under 2000 bytes
                res = imgs[index]
                open("target_max_img/$(i).$(res[2])", "w") do fo
                    write(fo, res[3])
                end
            end
        end
    catch e
        println(buf, "\t", e)
    end
    lock(lk) do
        println(io, String(take!(buf)))
        flush(io)
    end
end
io = open("log_down_webp", "w")
lk = ReentrantLock()
# save("test.jpg", read_webimg("https://www.thisisnotdietfood.com/wp-content/uploads/2019/06/baconcheeseburgergrilledcheesecasserole-13-min-min.jpg"))
csv_data = CSV.Rows("origin_data.csv")
urls_todo = enumerate(csv_data)
# main(1, urls_todo[1][2].targetUrl)
# CSV.Rows streams rows, so walk it once up front just to count them for the progress bar.
len = 0
for (i, row) in urls_todo
    global len = i
end
p = Progress(len)
task_running = Dict{Int, Task}()
for (i, row) in urls_todo
    # main(i, row.targetUrl)
    task_running[i] = Threads.@spawn main(i, row.targetUrl)
    next!(p)
    # Throttle: keep at most ~30 rows in flight at a time.
    while length(task_running) > 30
        for (k, t) in task_running
            if istaskfailed(t)
                @show k t.result   # t.result holds the stored exception; fetch(t) would rethrow it
            end
        end
        sleep(1)
        filter!(x -> !istaskdone(x[2]), task_running)   # istaskdone is also true for failed tasks
    end
end
foreach(t -> (try wait(t) catch end), values(task_running))   # drain remaining tasks before closing the log
close(io)
The script gets terminated by the OS while it is running.