require 'aquanaut'
require 'spec_helper'
require 'webmock/rspec'

# Specs for Aquanaut::Worker: construction, internal-link detection, link
# extraction and crawling. HTTP requests are stubbed through WebMock.
#
# NOTE(review): be_true / be_false are the RSpec 2 matcher names; RSpec 3
# renamed them to be_truthy / be_falsey. Rename them when the suite is upgraded.
describe Aquanaut::Worker do
  # Address every example points the worker at.
  let(:target) { 'http://www.example.com' }
  # Parsed form of the target, used as the page to fetch and as expected value.
  let(:target_uri) { URI.parse(target) }
  # `let` is lazy: the worker is built on first use inside each example.
  let(:worker) { Aquanaut::Worker.new(target) }
  # Headers shared by every stubbed HTML response.
  let(:html_headers) { { 'Content-Type' => 'text/html' } }

  describe "#initialize" do
    it "initializes the queue with the target address" do
      queue = worker.instance_variable_get(:@queue)

      expect(queue).to eq([target_uri])
    end

    it "stores the target address in its different components" do
      domain = worker.instance_variable_get(:@domain)

      expect(domain.tld).to eq('com')
      expect(domain.sld).to eq('example')
      expect(domain.trd).to eq('www')
    end
  end

  describe "#internal?" do
    it "compares second-level and top-level domain" do
      same_host = URI.parse('http://www.example.com')
      expect(worker.internal?(same_host)).to be_true

      other_subdomain = URI.parse('http://blog.example.com')
      expect(worker.internal?(other_subdomain)).to be_true

      other_domain = URI.parse('http://www.not-example.com')
      expect(worker.internal?(other_domain)).to be_false
    end

    it "guards against invalid domains" do
      # A relative reference carries no host component.
      relative = URI.parse('/internal.html')

      expect(worker.internal?(relative)).to be_true
    end
  end

  describe "#links" do
    # NOTE(review): the HTML fixtures in this group reached review with their
    # markup stripped (only the link texts were left). The anchors below were
    # rebuilt from the URLs each example stubs; confirm against the intended
    # fixtures. Fixtures that could not be derived are flagged individually.

    # The three internal pages most fixtures link to.
    let(:internal_pages) do
      %w[
        http://www.example.com/home.html
        http://www.example.com/about.html
        http://www.example.com/contact.html
      ].map { |address| URI.parse(address) }
    end

    it "retrieves no links from a page with no body" do
      response = { headers: html_headers }
      stub_request(:get, 'www.example.com').to_return(response)

      expect(worker.links(target_uri).first).to be_empty
    end

    it "returns a list of URIs for a page with anchor elements" do
      body = <<-BODY
        <a href="http://www.example.com/home.html">Home</a>
        <a href="http://www.example.com/about.html">About us</a>
        <a href="http://www.example.com/contact.html">Contact</a>
      BODY
      response = { body: body, headers: html_headers }

      stub_request(:get, 'www.example.com').to_return(response)
      stub_request(:head, 'www.example.com/home.html').to_return(response)
      stub_request(:get, 'www.example.com/home.html').to_return(response)
      stub_request(:head, 'www.example.com/about.html').to_return(response)
      stub_request(:get, 'www.example.com/about.html').to_return(response)
      stub_request(:head, 'www.example.com/contact.html').to_return(response)
      stub_request(:get, 'www.example.com/contact.html').to_return(response)
      stub_request(:head, 'www.not-example.com').to_return(response)

      expect(worker.links(target_uri).first).to eq(internal_pages)
    end

    it "returns the final location when encountering HTTP 3xx" do
      # TODO: this example only registers a stub; it never calls #links and
      # asserts nothing, so it currently passes without testing anything.
      # NOTE(review): the fixture has no anchor markup; its intended link
      # target cannot be derived from the stubs, so it is left untouched.
      body = 'Follow me'
      response = { body: body, headers: html_headers }

      stub_request(:get, 'www.example.com').to_return(response)
    end

    it "filters links that reference an external domain directly" do
      body = <<-BODY
        <a href="http://www.example.com/home.html">Home</a>
        <a href="http://www.example.com/about.html">About us</a>
        <a href="http://www.example.com/contact.html">Contact</a>
        <a href="http://www.not-example.com">Not Example</a>
      BODY
      response = { body: body, headers: html_headers }

      stub_request(:get, 'www.example.com').to_return(response)
      stub_request(:head, 'www.example.com/home.html').to_return(response)
      stub_request(:get, 'www.example.com/home.html').to_return(response)
      stub_request(:head, 'www.example.com/about.html').to_return(response)
      stub_request(:get, 'www.example.com/about.html').to_return(response)
      stub_request(:head, 'www.example.com/contact.html').to_return(response)
      stub_request(:get, 'www.example.com/contact.html').to_return(response)
      stub_request(:head, 'www.not-example.com').to_return(response)

      expect(worker.links(target_uri).first).to eq(internal_pages)
    end

    it "filters links that reference an external domain indirectly" do
      body = <<-BODY
        <a href="http://www.example.com/home.html">Home</a>
        <a href="http://www.example.com/about.html">About us</a>
        <a href="http://www.example.com/contact.html">Contact</a>
        <a href="http://www.example.com/moved.html">Moved</a>
      BODY
      other_domain = 'http://www.not-example.com'
      response = { body: body, headers: html_headers }
      # moved.html answers HEAD with a permanent redirect to the other domain.
      forward = { status: 301, headers: { 'Location' => other_domain } }

      stub_request(:get, 'www.example.com').to_return(response)
      stub_request(:head, 'www.example.com/home.html').to_return(response)
      stub_request(:get, 'www.example.com/home.html').to_return(response)
      stub_request(:head, 'www.example.com/about.html').to_return(response)
      stub_request(:get, 'www.example.com/about.html').to_return(response)
      stub_request(:head, 'www.example.com/contact.html').to_return(response)
      stub_request(:get, 'www.example.com/contact.html').to_return(response)
      stub_request(:head, 'www.example.com/moved.html').to_return(forward)
      stub_request(:head, other_domain).to_return(response)

      expect(worker.links(target_uri).first).to eq(internal_pages)
    end

    it "rejects errors raised by Mechanize when retrieving the page" do
      response = { status: 500 }
      stub_request(:get, 'www.example.com').to_return(response)

      expect(worker.links(target_uri).first).to be_empty
    end

    it "rejects errors raised by Mechanize when checking the links" do
      body = <<-BODY
        <a href="http://www.example.com/home.html">Home</a>
        <a href="http://www.example.com/about.html">About us</a>
      BODY
      response = { body: body, headers: html_headers }
      response_500 = { status: 500 }

      stub_request(:get, 'www.example.com').to_return(response)
      stub_request(:head, 'www.example.com/home.html').to_return(response)
      stub_request(:head, 'www.example.com/about.html').to_return(response_500)

      expected = [URI.parse('http://www.example.com/home.html')]
      expect(worker.links(target_uri).first).to eq(expected)
    end

    it "rejects invalid URIs" do
      # NOTE(review): the fixture has no anchor markup, and the malformed href
      # it is meant to carry cannot be derived from the stubs. Left untouched;
      # as written the page presumably yields no links at all - confirm.
      body = 'Invalid'
      response = { body: body, headers: html_headers }

      stub_request(:get, 'www.example.com').to_return(response)

      expect(worker.links(target_uri).first).to be_empty
    end

    it "rejects anchors with no href attribute" do
      body = '<a>Empty</a>'
      response = { body: body, headers: html_headers }

      stub_request(:get, 'www.example.com').to_return(response)

      expect(worker.links(target_uri).first).to be_empty
    end

    it "rejects links that lead to a timeout" do
      body = '<a href="http://www.example.com/timeout.html">Timeout</a>'
      response = { body: body, headers: html_headers }

      stub_request(:get, 'www.example.com').to_return(response)
      stub_request(:head, 'www.example.com/timeout.html').to_timeout

      expect(worker.links(target_uri).first).to be_empty
    end

    it "rejects links that have already been grabbed" do
      # The same page is linked twice and must be reported once.
      body = <<-BODY
        <a href="http://www.example.com/home.html">Home</a>
        <a href="http://www.example.com/home.html">Home</a>
      BODY
      response = { body: body, headers: html_headers }

      stub_request(:get, 'www.example.com').to_return(response)
      stub_request(:get, 'www.example.com/home.html').to_return(response)
      stub_request(:head, 'www.example.com/home.html').to_return(response)

      expected = [URI.parse('http://www.example.com/home.html')]
      expect(worker.links(target_uri).first).to eq(expected)
    end
  end

  describe "#explore" do
    it "starts the crawling by processing the first queue element" do
      response = { headers: html_headers }
      stub_request(:get, 'www.example.com').to_return(response)

      worker.explore

      queue = worker.instance_variable_get(:@queue)
      expect(queue).to be_empty
    end

    it "marks visited sites" do
      response = { headers: html_headers }
      stub_request(:get, 'www.example.com').to_return(response)

      visited = worker.instance_variable_get(:@visited)

      expect { worker.explore }.to change { visited.size }.by(1)
    end

    # Declared without a block so RSpec reports it as pending instead of
    # counting an empty example as a pass.
    it "skips already visited sites"
  end
end