require 'aquanaut'
require 'spec_helper'
require 'webmock/rspec'
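
# Specs for Aquanaut::Worker, the crawler component that keeps a queue of
# URIs to visit, decides whether a link is internal to the target domain,
# and collects links from fetched pages. All HTTP traffic is stubbed out
# with WebMock, so no real requests are made.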
describe Aquanaut::Worker do
describe "#initialize" do
it "initializes the queue with the target address" do
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
queue = worker.instance_variable_get('@queue')
expected_queue = [URI.parse(target)]
expect(queue).to eq(expected_queue)
end
it "stores the target address in its different components" do
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
domain = worker.instance_variable_get('@domain')
expect(domain.tld).to eq('com')
expect(domain.sld).to eq('example')
expect(domain.trd).to eq('www')
end
end
describe "#internal?" do
it "compares second-level and top-level domain" do
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uri = URI.parse('http://www.example.com')
expect(worker.internal?(uri)).to be_true
uri = URI.parse('http://blog.example.com')
expect(worker.internal?(uri)).to be_true
uri = URI.parse('http://www.not-example.com')
expect(worker.internal?(uri)).to be_false
end
it "guards against invalid domains" do
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uri = URI.parse('/internal.html')
expect(worker.internal?(uri)).to be_true
end
end
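
  # Worker#links appears to return a pair whose first element is the list of
  # crawlable URIs found on the page, hence the `.first` on every expectation
  # below; the second element is not exercised in these examples.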
describe "#links" do
it "retrieves no links from a page with no body" do
response = { headers: { 'Content-Type' => 'text/html'} }
stub_request(:get, 'www.example.com').to_return(response)
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uri = URI.parse('http://www.example.com')
expect(worker.links(uri).first).to be_empty
end
it "returns a list of URIs for a page with anchor elements" do
body = <<-BODY
Home
About us
Contact
BODY
response = { body: body, headers: { 'Content-Type' => 'text/html'} }
stub_request(:get, 'www.example.com').to_return(response)
stub_request(:get, 'www.example.com').to_return(response)
stub_request(:head, 'www.example.com/home.html').to_return(response)
stub_request(:get, 'www.example.com/home.html').to_return(response)
stub_request(:head, 'www.example.com/about.html').to_return(response)
stub_request(:get, 'www.example.com/about.html').to_return(response)
stub_request(:head, 'www.example.com/contact.html').to_return(response)
stub_request(:get, 'www.example.com/contact.html').to_return(response)
stub_request(:head, 'www.not-example.com').to_return(response)
uris = ['http://www.example.com/home.html',
'http://www.example.com/about.html',
'http://www.example.com/contact.html']
uris.map! { |uri| URI.parse(uri) }
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uri = URI.parse('http://www.example.com')
expect(worker.links(uri).first).to eq(uris)
end
it "returns the final location when encountering HTTP 3xx" do
body = 'Follow me'
response = { body: body, headers: { 'Content-Type' => 'text/html'} }
stub_request(:get, 'www.example.com').to_return(response)
end
it "filters links that reference an external domain directly" do
body = <<-BODY
Home
About us
Contact
Not Example
BODY
response = { body: body, headers: { 'Content-Type' => 'text/html'} }
stub_request(:get, 'www.example.com').to_return(response)
stub_request(:head, 'www.example.com/home.html').to_return(response)
stub_request(:get, 'www.example.com/home.html').to_return(response)
stub_request(:head, 'www.example.com/about.html').to_return(response)
stub_request(:get, 'www.example.com/about.html').to_return(response)
stub_request(:head, 'www.example.com/contact.html').to_return(response)
stub_request(:get, 'www.example.com/contact.html').to_return(response)
stub_request(:head, 'www.not-example.com').to_return(response)
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uris = ['http://www.example.com/home.html',
'http://www.example.com/about.html',
'http://www.example.com/contact.html']
uris.map! { |uri| URI.parse(uri) }
uri = URI.parse('http://www.example.com')
expect(worker.links(uri).first).to eq(uris)
end
it "filters links that reference an external domain indirectly" do
body = <<-BODY
Home
About us
Contact
Moved
BODY
other_domain = 'http://www.not-example.com'
response = { body: body, headers: { 'Content-Type' => 'text/html'} }
forward = { status: 301, headers: { 'Location' => other_domain } }
stub_request(:get, 'www.example.com').to_return(response)
stub_request(:head, 'www.example.com/home.html').to_return(response)
stub_request(:get, 'www.example.com/home.html').to_return(response)
stub_request(:head, 'www.example.com/about.html').to_return(response)
stub_request(:get, 'www.example.com/about.html').to_return(response)
stub_request(:head, 'www.example.com/contact.html').to_return(response)
stub_request(:get, 'www.example.com/contact.html').to_return(response)
stub_request(:head, 'www.example.com/moved.html').to_return(forward)
stub_request(:head, other_domain).to_return(response)
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uris = ['http://www.example.com/home.html',
'http://www.example.com/about.html',
'http://www.example.com/contact.html']
uris.map! { |uri| URI.parse(uri) }
uri = URI.parse('http://www.example.com')
expect(worker.links(uri).first).to eq(uris)
end
it "rejects errors raised by Mechanize when retrieving the page" do
response = { status: 500 }
stub_request(:get, 'www.example.com').to_return(response)
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uri = URI.parse(target)
expect(worker.links(uri).first).to be_empty
end
it "rejects errors raised by Mechanize when checking the links" do
body = <<-BODY
Home
About us
BODY
headers = { 'Content-Type' => 'text/html'}
response = { body: body, headers: headers }
response_500 = { status: 500 }
stub_request(:get, 'www.example.com').to_return(response)
stub_request(:head, 'www.example.com/home.html').to_return(response)
stub_request(:head, 'www.example.com/about.html').to_return(response_500)
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uri = URI.parse(target)
uris = [URI.parse('http://www.example.com/home.html')]
expect(worker.links(uri).first).to eq(uris)
end
it "rejects invalid URIs" do
body = 'Invalid'
headers = { 'Content-Type' => 'text/html'}
response = { body: body, headers: headers }
stub_request(:get, 'www.example.com').to_return(response)
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uri = URI.parse(target)
expect(worker.links(uri).first).to be_empty
end
it "rejects anchors with no href attribute" do
body = 'Empty'
headers = { 'Content-Type' => 'text/html'}
response = { body: body, headers: headers }
stub_request(:get, 'www.example.com').to_return(response)
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uri = URI.parse(target)
expect(worker.links(uri).first).to be_empty
end
it "rejects links that lead to a timeout" do
body = 'Timeout'
headers = { 'Content-Type' => 'text/html'}
response = { body: body, headers: headers }
stub_request(:get, 'www.example.com').to_return(response)
stub_request(:head, 'www.example.com/timeout.html').to_timeout
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uri = URI.parse(target)
expect(worker.links(uri).first).to be_empty
end
it "rejects links that have already been grabbed" do
body = <<-BODY
Home
Home
BODY
response = { body: body, headers: { 'Content-Type' => 'text/html'} }
stub_request(:get, 'www.example.com').to_return(response)
stub_request(:get, 'www.example.com/home.html').to_return(response)
stub_request(:head, 'www.example.com/home.html').to_return(response)
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
uri = URI.parse(target)
result = [URI.parse('http://www.example.com/home.html')]
expect(worker.links(uri).first).to eq(result)
end
end
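
  # Worker#explore appears to drain @queue, fetching each URI and recording it
  # in @visited; the examples below reach into those internals to observe it.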
describe "#explore" do
it "starts the crawling by processing the first queue element" do
response = { headers: { 'Content-Type' => 'text/html'} }
stub_request(:get, 'www.example.com').to_return(response)
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
worker.explore
queue = worker.instance_variable_get('@queue')
expect(queue).to be_empty
end
it "marks visited sites" do
response = { headers: { 'Content-Type' => 'text/html'} }
stub_request(:get, 'www.example.com').to_return(response)
target = 'http://www.example.com'
worker = Aquanaut::Worker.new(target)
visited = worker.instance_variable_get('@visited')
expect { worker.explore }.to change { visited.size }.by(1)
end
it "skips already visited sites" do
end
end
end