Using Blink for web scraping à la Selenium?

I have been using RSelenium to scrape websites that rely on JavaScript. I am considering implementing a Julia binding for Selenium, but I first wanted to check whether Blink could serve as a web driver that supports the usual WebDriver API operations, for example:

  • Finding elements by (CSS / XPATH)
  • Querying elements (e.g., attributes)
  • Interacting with elements (e.g., clicking, sending text)
  • Navigate/Back
  • Obtaining the page source

Hopefully within Julia rather than doing all the operations in JavaScript.

Examples would be very appreciated!

3 Likes

I tried the same approach but they need to fix some bugs.

Here is a draft of the code

To test it, run the Docker image selenium/standalone-chrome locally with port 4444 published.

using HTTP, JSON3, Parameters
import Base: close
"""
    Session(; addr = "http://localhost", port = 4444)

Handle to a browser session created on a remote WebDriver server.

The constructor posts a new-session request (asking for the `chrome` browser) to
`<addr>:<port>/wd/hub/session` and stores what the server reports back.

# Fields
- `endpoint`: base URL of the hub, i.e. `<addr>:<port>/wd/hub`.
- `id`: the `sessionId` entry of the server response.
- `timeouts`: the `value.timeouts` entry of the server response, keyed by `Symbol`.
"""
struct Session
    endpoint::String
    id::String
    timeouts::Dict{Symbol,Int}
    function Session(; addr::AbstractString="http://localhost", port::Integer=4444)
        hub = "$addr:$port/wd/hub"
        headers = ["Content-Type" => "application/json"]
        payload = """{"desiredCapabilities": {"browserName": "chrome"} }"""
        reply = HTTP.post("$hub/session", headers, payload)
        @assert reply.status == 200
        parsed = JSON3.read(reply.body)
        session_id = parsed[:sessionId]
        reported_timeouts = parsed[:value][:timeouts]
        return new(hub, session_id, Dict{Symbol,Int}(reported_timeouts))
    end
end
"""
    Element(session::Session, id::String)

Reference to a single page element: the `Session` it was found in plus the element
identifier string that is spliced into element request URLs.
"""
struct Element
    session::Session
    id::String
end
"""
    close(session::Session)

Send a `DELETE` request for `<endpoint>/session/<id>` and return `nothing`.
"""
function close(session::Session)
    target = string(session.endpoint, "/session/", session.id)
    HTTP.delete(target)
    return nothing
end
# Example: open a session with the constructor defaults (http://localhost, port 4444).
session = Session()
"""
    session_status(session::Session)

Request `<endpoint>/status` and return the `status` entry of the parsed JSON reply.
"""
function session_status(session::Session)
    headers = ["Content-Type" => "application/json"]
    reply = HTTP.get(string(session.endpoint, "/status"), headers)
    @assert reply.status == 200
    return JSON3.read(reply.body).status
end
"""
    current_url(session::Session)

Request `<endpoint>/session/<id>/url` and return the `value` entry of the parsed JSON reply.
"""
function current_url(session::Session)
    headers = ["Content-Type" => "application/json"]
    target = string(session.endpoint, "/session/", session.id, "/url")
    reply = HTTP.get(target, headers)
    @assert reply.status == 200
    return JSON3.read(reply.body).value
end
"""
    navigate!(session::Session, url::AbstractString)

Post `url` to `<endpoint>/session/<id>/url` so the session's browser loads it.
Returns `nothing`.
"""
function navigate!(session::Session, url::AbstractString)
    @unpack endpoint, id = session
    # Serialize with JSON3 rather than string interpolation so that quotes or
    # backslashes inside `url` are escaped and the request body stays valid JSON.
    payload = JSON3.write(Dict("url" => url))
    response = HTTP.post("$endpoint/session/$id/url",
                         ["Content-Type" => "application/json"],
                         payload)
    @assert response.status == 200
    return nothing
end
# Example: ask the server to load https://google.com in this session.
navigate!(session, "https://google.com")
"""
    back!(session::Session)

Post to `<endpoint>/session/<id>/back` and return `nothing`.
"""
function back!(session::Session)
    headers = ["Content-Type" => "application/json"]
    target = string(session.endpoint, "/session/", session.id, "/back")
    reply = HTTP.post(target, headers)
    @assert reply.status == 200
    return nothing
end
# Example: request a back navigation for the open session.
back!(session)
"""
    forward!(session::Session)

Post to `<endpoint>/session/<id>/forward` and return `nothing`.
"""
function forward!(session::Session)
    headers = ["Content-Type" => "application/json"]
    target = string(session.endpoint, "/session/", session.id, "/forward")
    reply = HTTP.post(target, headers)
    @assert reply.status == 200
    return nothing
end
# Example: request a forward navigation for the open session.
forward!(session)
"""
    timeouts!(session::Session; script = nothing, pageLoad = nothing, implicit = nothing)

Post the given timeout settings to `<endpoint>/session/<id>/timeouts` and, once the
server has answered with status 200, record them in `session.timeouts`.

# Keywords
- `script`, `pageLoad`, `implicit`: integer timeout values; a keyword left as `nothing`
  is omitted from the request and its cached entry is left untouched.

Returns `nothing`.
"""
function timeouts!(session::Session;
                   script::Union{Integer,Nothing} = nothing,
                   pageLoad::Union{Integer,Nothing} = nothing,
                   implicit::Union{Integer,Nothing} = nothing)
    @unpack endpoint, id = session
    # Collect only the settings the caller actually supplied.
    requested = Dict{Symbol,Int}(
        key => setting for (key, setting) in
        (:script => script, :pageLoad => pageLoad, :implicit => implicit)
        if !isnothing(setting)
    )
    response = HTTP.post("$endpoint/session/$id/timeouts",
                         ["Content-Type" => "application/json"],
                         JSON3.write(requested))
    @assert response.status == 200
    # Mirror the accepted settings locally. The cache is keyed by the same names that
    # are sent to the server (`:implicit`, not the request-fragment variable name).
    merge!(session.timeouts, requested)
    return nothing
end
# Example: fetch the URL currently reported for the session.
current_url(session)
"""
    refresh!(session::Session)

Post to `<endpoint>/session/<id>/refresh` to reload the current page.
Returns `nothing`.
"""
function refresh!(session::Session)
    @unpack endpoint, id = session
    # The refresh command has its own endpoint; posting to `/timeouts` here would
    # address the timeouts command instead of reloading the page.
    response = HTTP.post("$endpoint/session/$id/refresh",
                         ["Content-Type" => "application/json"])
    @assert response.status == 200
    return nothing
end
# Example: invoke refresh! on the open session.
refresh!(session)
"""
    findelement(session::Session, value::AbstractString; css::Bool = true)

Ask the server for the first element matching `value` and wrap it in an `Element`.

# Arguments
- `value`: the selector; a CSS selector when `css` is `true`, an XPath expression otherwise.

# Keywords
- `css`: choose the `"css selector"` (default) or the `"xpath"` locator strategy.
"""
function findelement(session::Session, value::AbstractString; css::Bool = true)
    @unpack endpoint, id = session
    # WebDriver names the CSS strategy "css selector"; plain "css" is not a strategy.
    strategy = css ? "css selector" : "xpath"
    # Serialize with JSON3 so selectors containing quotes (e.g. input[name="q"]) are
    # escaped instead of corrupting the request body.
    payload = JSON3.write(Dict("using" => strategy, "value" => value))
    response = HTTP.post("$endpoint/session/$id/element",
                         ["Content-Type" => "application/json"],
                         payload)
    @assert response.status == 200
    json = JSON3.read(response.body)
    # `json.value` is a key => id object; the id of its first entry identifies the element.
    return Element(session, last(first(json.value)))
end
# Example: look up the input named "q" with an XPath expression (css = false selects xpath).
element = findelement(session, """//input[@name='q']""", css = false)
"""
    element_text(element::Element)

Request `<endpoint>/session/<id>/element/<element id>/text` and return the `value` entry
of the parsed JSON reply.
"""
function element_text(element::Element)
    owner = element.session
    headers = ["Content-Type" => "application/json"]
    target = string(owner.endpoint, "/session/", owner.id, "/element/", element.id, "/text")
    reply = HTTP.get(target, headers)
    @assert reply.status == 200
    return JSON3.read(reply.body).value
end
# Example: read the text reported for the element found above.
element_text(element)
"""
    element_text!(element::Element, value::AbstractString)

Send `value` as keystrokes to `element` by posting to
`<endpoint>/session/<id>/element/<element id>/value`. Returns the HTTP response.
"""
function element_text!(element::Element, value::AbstractString)
    @unpack endpoint, id = element.session
    element_id = element.id
    # Use the caller's `value` (the draft overwrote it with a hard-coded debug string)
    # and let JSON3 escape it so arbitrary text yields a valid request body.
    # NOTE(review): the character list under "value" is sent alongside "text" because
    # servers speaking the older JSON wire protocol are understood to read that key;
    # confirm against the target server.
    payload = JSON3.write(Dict("text" => value, "value" => string.(collect(value))))
    return HTTP.post("$endpoint/session/$id/element/$element_id/value",
                     ["Content-Type" => "application/json"],
                     payload)
end

Based on this tutorial.

3 Likes
2 Likes

Does WebDriver.jl support automation with Firefox? The manual page in the docs says: "The required argument being the browser name (chrome)."

Aye. You can use Selenium Firefox for instance and pass the browser name firefox.

1 Like

Thanks, that looks great. But I am unsure how to proceed with setting this up. Should I pull that Docker image and then start it in one of the ways listed in the README of the image's Git repo, for example: docker run -d -p 4444:4444 -v /dev/shm:/dev/shm selenium/standalone-firefox:3.141.59-20200525

Then there are steps for using the images in https://github.com/SeleniumHQ/docker-selenium#example-spawn-a-container-for-testing-in-firefox, so would these commands be the way to go?

$ docker run -d --name selenium-hub -p 4444:4444 selenium/hub:3.141.59-20200525
$ FF=$(docker run --rm --name=fx \
    --link selenium-hub:hub -v /e2e/uploads:/e2e/uploads \
    -v /dev/shm:/dev/shm \
    selenium/node-firefox:3.141.59-20200525)

what do you think is the best approach?

The easiest is to use a Docker compose file (e.g., docker-compose.yml),

# Compose file defining a single service that runs the standalone Selenium Firefox image.
version: '3.7'
services:
  # WebDriver: Selenium
  selenium:
    image: selenium/standalone-firefox:3
    container_name: selenium
    # Publish container port 4444 on host port 4444 over TCP.
    ports:
      - target: 4444
        published: 4444
        protocol: tcp
        mode: host

You can then spin it up with

docker-compose -f docker-compose.yml up -d selenium

You can confirm the container is running with

docker ps

For the code,

using WebDriver
# Describe the remote WebDriver server: request the "firefox" browser from localhost:4444.
wd = RemoteWebDriver(
    Capabilities("firefox"),
    host = "localhost",
    port = 4444,
    )
# New Session
session = Session(wd)
1 Like