spec/agent_spec.rb in spidr-0.6.1 vs spec/agent_spec.rb in spidr-0.7.0

- old
+ new

@@ -5,10 +5,254 @@ require 'spidr/agent' describe Agent do it_should_behave_like "includes Spidr::Settings::UserAgent" + describe ".start_at" do + module TestAgentStartAt + class ExampleApp < Sinatra::Base + + set :host, 'example.com' + set :port, 80 + + get '/' do + '<html><body>should not get here</body></html>' + end + + get '/entry-point' do + <<~HTML + <html> + <body> + <a href="/link1">link1</a> + <a href="http://other.com/offsite-link">offsite link</a> + <a href="/link2">link2</a> + </body> + </html> + HTML + end + + get '/link1' do + '<html><body>got here</body></html>' + end + + get '/link2' do + '<html><body>got here</body></html>' + end + end + + class OtherApp < Sinatra::Base + + set :host, 'other.com' + set :port, 80 + + get '/offsite-link' do + '<html><body>should not get here</body></html>' + end + + end + end + + subject { described_class } + + let(:host) { 'example.com' } + let(:other_host) { 'other.com' } + let(:url) { URI("http://#{host}/entry-point") } + + let(:app) { TestAgentStartAt::ExampleApp } + let(:other_app) { TestAgentStartAt::OtherApp } + + before do + stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app) + stub_request(:any, /#{Regexp.escape(other_host)}/).to_rack(other_app) + end + + it "must spider the website starting at the given URL" do + agent = subject.start_at(url) + + expect(agent.history).to be == Set[ + URI("http://#{host}/entry-point"), + URI("http://#{host}/link1"), + URI("http://#{other_host}/offsite-link"), + URI("http://#{host}/link2") + ] + end + end + + describe ".site" do + module TestAgentSite + class ExampleApp < Sinatra::Base + + set :host, 'example.com' + set :port, 80 + + get '/' do + '<html><body>should not get here</body></html>' + end + + get '/entry-point' do + <<~HTML + <html> + <body> + <a href="/link1">link1</a> + <a href="http://other.com/offsite-link">offsite link</a> + <a href="/link2">link2</a> + </body> + </html> + HTML + end + + get '/link1' do + '<html><body>got here</body></html>' + end + + get '/link2' do + '<html><body>got here</body></html>' + end + + end + end + + subject { described_class } + + let(:host) { 'example.com' } + let(:url) { URI("http://#{host}/entry-point") } + + let(:app) { TestAgentSite::ExampleApp } + + before do + stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app) + end + + it "must spider the website starting at the given URL" do + agent = subject.site(url) + + expect(agent.history).to be == Set[ + URI("http://#{host}/entry-point"), + URI("http://#{host}/link1"), + URI("http://#{host}/link2") + ] + end + end + + describe ".host" do + module TestAgentHost + class ExampleApp < Sinatra::Base + + set :host, 'example.com' + set :port, 80 + + get '/' do + <<~HTML + <html> + <body> + <a href="/link1">link1</a> + <a href="http://other.com/offsite-link">offsite link</a> + <a href="/link2">link2</a> + </body> + </html> + HTML + end + + get '/link1' do + '<html><body>got here</body></html>' + end + + get '/link2' do + '<html><body>got here</body></html>' + end + + end + end + + subject { described_class } + + let(:host) { 'example.com' } + let(:app) { TestAgentHost::ExampleApp } + + before do + stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app) + end + + it "must spider the website starting at the given URL" do + agent = subject.host(host) + + # XXX: for some reason Set#== was returning false, so convert to an Array + expect(agent.history.to_a).to be == [ + URI("http://#{host}/"), + URI("http://#{host}/link1"), + URI("http://#{host}/link2") + ] + end + end + + describe ".domain" do + module TestAgentDomain + class ExampleApp < Sinatra::Base + + set :host, 'example.com' + set :port, 80 + + get '/' do + <<~HTML + <html> + <body> + <a href="/link1">link1</a> + <a href="http://sub.example.com/subdomain-link">subdomain link</a> + <a href="/link2">link2</a> + </body> + </html> + HTML + end + + get '/link1' do + '<html><body>got here</body></html>' + end + + get '/link2' do + '<html><body>got here</body></html>' + end + + end + + class SubDomainApp < Sinatra::Base + + set :host, 'sub.example.com' + set :port, 80 + + get '/subdomain-link' do + '<html><body>should get here</body></html>' + end + + end + end + + subject { described_class } + + let(:domain) { 'example.com' } + let(:domain_app) { TestAgentDomain::ExampleApp } + + let(:subdomain) { 'sub.example.com' } + let(:subdomain_app) { TestAgentDomain::SubDomainApp } + + before do + stub_request(:any, /#{Regexp.escape(subdomain)}/).to_rack(subdomain_app) + stub_request(:any, /#{Regexp.escape(domain)}/).to_rack(domain_app) + end + + it "must spider the domain and subdomains starting at the given domain" do + agent = subject.domain(domain) + + # XXX: for some reason Set#== was returning false, so convert to an Array + expect(agent.history.to_a).to be == [ + URI("http://#{domain}/"), + URI("http://#{domain}/link1"), + URI("http://#{subdomain}/subdomain-link"), + URI("http://#{domain}/link2") + ] + end + end + describe "#initialize" do it "should not be running" do expect(subject).to_not be_running end @@ -30,10 +274,72 @@ it "should initialize the #session_cache" do expect(subject.sessions).to be_kind_of(SessionCache) end + context "when the proxy: keyword argument is given" do + let(:proxy) do + Spidr::Proxy.new(host: 'example.com') + end + + subject { described_class.new(proxy: proxy) } + + it "must initialize the #proxy of #session_cache" do + expect(subject.sessions.proxy).to be(proxy) + end + end + + context "when the open_timeout: keyword argument is given" do + let(:open_timeout) { 5 } + + subject { described_class.new(open_timeout: open_timeout) } + + it "must initialize the #open_timeout of #session_cache" do + expect(subject.sessions.open_timeout).to eq(open_timeout) + end + end + + context "when the ssl_timeout: keyword argument is given" do + let(:ssl_timeout) { 5 } + + subject { described_class.new(ssl_timeout: ssl_timeout) } + + it "must initialize the #ssl_timeout of #session_cache" do + expect(subject.sessions.ssl_timeout).to eq(ssl_timeout) + end + end + + context "when the read_timeout: keyword argument is given" do + let(:read_timeout) { 5 } + + subject { described_class.new(read_timeout: read_timeout) } + + it "must initialize the #read_timeout of #session_cache" do + expect(subject.sessions.read_timeout).to eq(read_timeout) + end + end + + context "when the continue_timeout: keyword argument is given" do + let(:continue_timeout) { 5 } + + subject { described_class.new(continue_timeout: continue_timeout) } + + it "must initialize the #continue_timeout of #session_cache" do + expect(subject.sessions.continue_timeout).to eq(continue_timeout) + end + end + + context "when the keep_alive_timeout: keyword argument is given" do + let(:keep_alive_timeout) { 5 } + + subject { described_class.new(keep_alive_timeout: keep_alive_timeout) } + + it "must initialize the #keep_alive_timeout of #session_cache" do + expect(subject.sessions.keep_alive_timeout).to eq(keep_alive_timeout) + end + end + it "should initialize the #cookie_jar" do expect(subject.cookies).to be_kind_of(CookieJar) end it "should initialize the #auth_store" do @@ -384,11 +690,19 @@ end context "frames" do app do get '/' do - %{<html><body><frameset><frame src="/frame" /></frameset></body></html>} + <<~HTML + <html> + <body> + <frameset> + <frame src="/frame" /> + </frameset> + </body> + </html> + HTML end get '/frame' do %{<html><body><a href="/link">link</a></body></html>} end @@ -612,11 +926,18 @@ get '/' do %{<html><body><a href="/redirect">redirect</a></body></html>} end get '/redirect' do - %{<html><head><meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" /></head><body>Redirecting...</body></html>} + <<~HTML + <html> + <head> + <meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" /> + </head> + <body>Redirecting...</body> + </html> + HTML end get '/link' do %{<html><body>got here</body></html>} end @@ -672,11 +993,18 @@ subject { described_class.new(host: host) } app do get '/' do - %{<html><body><a href="http://google.com/">external link</a> <a href="/link">local link</a></body></html>} + <<~HTML + <html> + <body> + <a href="http://google.com/">external link</a> + <a href="/link">local link</a> + </body> + </html> + HTML end get '/link' do %{<html><body>got here</body></html>} end @@ -724,17 +1052,31 @@ context "when :depth is set" do include_context "example App" app do get '/' do - %{<html><body><a href="/left?d=1">left</a><a href="/right?d=1">right</a></body></html>} + <<~HTML + <html> + <body> + <a href="/left?d=1">left</a> + <a href="/right?d=1">right</a> + </body> + </html> + HTML end - get %r{^/left|/right} do + get %r{/left|/right} do d = Integer(params['d']) - %{<html><body><a href="/left?d=#{d+1}">left</a><a href="/right?d=#{d+1}">right</a></body></html>} + <<~HTML + <html> + <body> + <a href="/left?d=#{d+1}">left</a> + <a href="/right?d=#{d+1}">right</a> + </body> + </html> + HTML end end context "depth 0" do subject { described_class.new(host: host, max_depth: 0) } @@ -772,10 +1114,17 @@ ) end app do get '/' do - %{<html><body><a href="/secret">don't follow this link</a> <a href="/pub">follow this link</a></body></html>} + <<~HTML + <html> + <body> + <a href="/secret">don't follow this link</a> + <a href="/pub">follow this link</a> + </body> + </html> + HTML end get '/pub' do %{<html><body>got here</body></html>} end