require 'spec_helper' require 'example_app' require 'settings/user_agent_examples' require 'spidr/agent' describe Agent do it_should_behave_like "includes Spidr::Settings::UserAgent" describe ".start_at" do module TestAgentStartAt class ExampleApp < Sinatra::Base set :host, 'example.com' set :port, 80 get '/' do '
should not get here' end get '/entry-point' do <<~HTML link1 offsite link link2 HTML end get '/link1' do 'got here' end get '/link2' do 'got here' end end class OtherApp < Sinatra::Base set :host, 'other.com' set :port, 80 get '/offsite-link' do 'should not get here' end end end subject { described_class } let(:host) { 'example.com' } let(:other_host) { 'other.com' } let(:url) { URI("http://#{host}/entry-point") } let(:app) { TestAgentStartAt::ExampleApp } let(:other_app) { TestAgentStartAt::OtherApp } before do stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app) stub_request(:any, /#{Regexp.escape(other_host)}/).to_rack(other_app) end it "must spider the website starting at the given URL" do agent = subject.start_at(url) expect(agent.history).to be == Set[ URI("http://#{host}/entry-point"), URI("http://#{host}/link1"), URI("http://#{other_host}/offsite-link"), URI("http://#{host}/link2") ] end end describe ".site" do module TestAgentSite class ExampleApp < Sinatra::Base set :host, 'example.com' set :port, 80 get '/' do 'should not get here' end get '/entry-point' do <<~HTML link1 offsite link link2 HTML end get '/link1' do 'got here' end get '/link2' do 'got here' end end end subject { described_class } let(:host) { 'example.com' } let(:url) { URI("http://#{host}/entry-point") } let(:app) { TestAgentSite::ExampleApp } before do stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app) end it "must spider the website starting at the given URL" do agent = subject.site(url) expect(agent.history).to be == Set[ URI("http://#{host}/entry-point"), URI("http://#{host}/link1"), URI("http://#{host}/link2") ] end end describe ".host" do module TestAgentHost class ExampleApp < Sinatra::Base set :host, 'example.com' set :port, 80 get '/' do <<~HTML link1 offsite link link2 HTML end get '/link1' do 'got here' end get '/link2' do 'got here' end end end subject { described_class } let(:host) { 'example.com' } let(:app) { TestAgentHost::ExampleApp } before do stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app) end it "must spider the website starting at the given URL" do agent = subject.host(host) # XXX: for some reason Set#== was returning false, so convert to an Array expect(agent.history.to_a).to be == [ URI("http://#{host}/"), URI("http://#{host}/link1"), URI("http://#{host}/link2") ] end end describe ".domain" do module TestAgentDomain class ExampleApp < Sinatra::Base set :host, 'example.com' set :port, 80 get '/' do <<~HTML link1 subdomain link link2 HTML end get '/link1' do 'got here' end get '/link2' do 'got here' end end class SubDomainApp < Sinatra::Base set :host, 'sub.example.com' set :port, 80 get '/subdomain-link' do 'should get here' end end end subject { described_class } let(:domain) { 'example.com' } let(:domain_app) { TestAgentDomain::ExampleApp } let(:subdomain) { 'sub.example.com' } let(:subdomain_app) { TestAgentDomain::SubDomainApp } before do stub_request(:any, /#{Regexp.escape(subdomain)}/).to_rack(subdomain_app) stub_request(:any, /#{Regexp.escape(domain)}/).to_rack(domain_app) end it "must spider the domain and subdomains starting at the given domain" do agent = subject.domain(domain) # XXX: for some reason Set#== was returning false, so convert to an Array expect(agent.history.to_a).to be == [ URI("http://#{domain}/"), URI("http://#{domain}/link1"), URI("http://#{subdomain}/subdomain-link"), URI("http://#{domain}/link2") ] end end describe "#initialize" do it "should not be running" do expect(subject).to_not be_running end it "should default :delay to 0" do expect(subject.delay).to be 0 end it "should initialize #history" do expect(subject.history).to be_empty end it "should initialize #failures" do expect(subject.failures).to be_empty end it "should initialize #queue" do expect(subject.queue).to be_empty end it "should initialize the #session_cache" do expect(subject.sessions).to be_kind_of(SessionCache) end context "when the proxy: keyword argument is given" do let(:proxy) do Spidr::Proxy.new(host: 'example.com') end subject { described_class.new(proxy: proxy) } it "must initialize the #proxy of #session_cache" do expect(subject.sessions.proxy).to be(proxy) end end context "when the open_timeout: keyword argument is given" do let(:open_timeout) { 5 } subject { described_class.new(open_timeout: open_timeout) } it "must initialize the #open_timeout of #session_cache" do expect(subject.sessions.open_timeout).to eq(open_timeout) end end context "when the ssl_timeout: keyword argument is given" do let(:ssl_timeout) { 5 } subject { described_class.new(ssl_timeout: ssl_timeout) } it "must initialize the #ssl_timeout of #session_cache" do expect(subject.sessions.ssl_timeout).to eq(ssl_timeout) end end context "when the read_timeout: keyword argument is given" do let(:read_timeout) { 5 } subject { described_class.new(read_timeout: read_timeout) } it "must initialize the #read_timeout of #session_cache" do expect(subject.sessions.read_timeout).to eq(read_timeout) end end context "when the continue_timeout: keyword argument is given" do let(:continue_timeout) { 5 } subject { described_class.new(continue_timeout: continue_timeout) } it "must initialize the #continue_timeout of #session_cache" do expect(subject.sessions.continue_timeout).to eq(continue_timeout) end end context "when the keep_alive_timeout: keyword argument is given" do let(:keep_alive_timeout) { 5 } subject { described_class.new(keep_alive_timeout: keep_alive_timeout) } it "must initialize the #keep_alive_timeout of #session_cache" do expect(subject.sessions.keep_alive_timeout).to eq(keep_alive_timeout) end end it "should initialize the #cookie_jar" do expect(subject.cookies).to be_kind_of(CookieJar) end it "should initialize the #auth_store" do expect(subject.authorized).to be_kind_of(AuthStore) end end describe "#history=" do let(:previous_history) { Set[URI('http://example.com')] } before { subject.history = previous_history } it "should be able to restore the history" do expect(subject.history).to eq(previous_history) end context "when given an Array of URIs" do let(:previous_history) { [URI('http://example.com')] } let(:converted_history) { Set.new(previous_history) } it "should convert the Array to a Set" do expect(subject.history).to eq(converted_history) end end context "when given an Set of Strings" do let(:previous_history) { Set['http://example.com'] } let(:converted_history) do previous_history.map { |url| URI(url) }.to_set end it "should convert the Strings to URIs" do expect(subject.history).to eq(converted_history) end end end describe "#failures=" do let(:previous_failures) { Set[URI('http://example.com')] } before { subject.failures = previous_failures } it "should be able to restore the failures" do expect(subject.failures).to eq(previous_failures) end context "when given an Array of URIs" do let(:previous_failures) { [URI('http://example.com')] } let(:converted_failures) { Set.new(previous_failures) } it "should convert the Array to a Set" do expect(subject.failures).to eq(converted_failures) end end context "when given an Set of Strings" do let(:previous_failures) { Set['http://example.com'] } let(:converted_failures) do previous_failures.map { |url| URI(url) }.to_set end it "should convert the Strings to URIs" do expect(subject.failures).to eq(converted_failures) end end end describe "#queue=" do let(:previous_queue) { [URI('http://example.com')] } before { subject.queue = previous_queue } it "should be able to restore the queue" do expect(subject.queue).to eq(previous_queue) end context "when given an Set of URIs" do let(:previous_queue) { Set[URI('http://example.com')] } let(:converted_queue) { previous_queue.to_a } it "should convert the Set to an Array" do expect(subject.queue).to eq(converted_queue) end end context "when given an Array of Strings" do let(:previous_queue) { Set['http://example.com'] } let(:converted_queue) { previous_queue.map { |url| URI(url) } } it "should convert the Strings to URIs" do expect(subject.queue).to eq(converted_queue) end end end describe "#to_hash" do let(:queue) { [URI("http://example.com/link")] } let(:history) { Set[URI("http://example.com/")] } subject do described_class.new do |agent| agent.queue = queue agent.history = history end end it "should return the queue and history" do expect(subject.to_hash).to be == { history: history, queue: queue } end end context "when spidering" do include_context "example App" context "local links" do context "relative paths" do app do get '/' do %{relative link} end get '/link' do 'got here' end end it "should expand relative paths of links" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/link") ] end context "that contain directory escapes" do app do get '/' do %{link} end get '/link' do 'got here' end end it "should expand relative paths before visiting them" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/link") ] end end end context "absolute paths" do app do get '/' do %{absolute path} end get '/link' do 'got here' end end it "should visit links with absolute paths" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/link") ] end context "that contain directory escapes" do app do get '/' do %{link} end get '/link' do 'got here' end end it "should expand absolute links before visiting them" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/link") ] end end end end context "remote links" do app do get '/' do %{absolute link} end get '/link' do 'got here' end end it "should visit absolute links" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/link") ] end context "that contain directory escapes" do app do get '/' do %{link} end get '/link' do 'got here' end end it "should expand absolute links before visiting them" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/link") ] end end end context "self-referential links" do app do get '/' do %{same page} end end it "should ignore self-referential links" do expect(subject.history).to be == Set[ URI("http://#{host}/") ] end end context "circular links" do app do get '/' do %{link} end get '/link' do %{previous page} end end it "should ignore links that have been previous visited" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/link") ] end end context "link cycles" do app do get '/' do %{first link} end get '/link1' do %{next link} end get '/link2' do %{back to the beginning} end end it "should ignore links that have been previous visited" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/link1"), URI("http://#{host}/link2"), ] end end context "fragment links" do app do get '/' do %{fragment link} end end it "should ignore fragment links" do expect(subject.history).to be == Set[ URI("http://#{host}/") ] end end context "empty links" do context "empty href" do app do get '/' do %{empty link blank link no href} end end it "should ignore links with empty hrefs" do expect(subject.history).to be == Set[ URI("http://#{host}/") ] end end context "whitespace href" do app do get '/' do %{blank link} end end it "should ignore links containing only whitespace" do expect(subject.history).to be == Set[ URI("http://#{host}/") ] end end context "missing href" do app do get '/' do %{no href} end end it "should ignore links with no href" do expect(subject.history).to be == Set[ URI("http://#{host}/") ] end end end context "frames" do app do get '/' do <<~HTML HTML end get '/frame' do %{link} end get '/link' do %{got here} end end it "should visit the frame and links within the frame" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/frame"), URI("http://#{host}/link") ] end end context "iframes" do app do get '/' do %{} end get '/iframe' do %{link} end get '/link' do %{got here} end end it "should visit the iframe and links within the iframe" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/iframe"), URI("http://#{host}/link") ] end end context "javascript links" do app do get '/' do %{javascript link} end end it "should ignore javascript: links" do expect(subject.history).to be == Set[ URI("http://#{host}/") ] end context "when the link has an onclick action" do app do get '/' do %{onclick link} end end it "should ignore links with onclick actions" do expect(subject.history).to be == Set[ URI("http://#{host}/") ] end end end context "cookies" do app do get '/' do response.set_cookie 'visited', 'true' %{link} end get '/link' do if request.cookies['visited'] == 'true' %{got here} else halt 401, "Cookie not set" end end end it "should record cookies and send them with each request" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/link"), ] expect(subject.cookies[host]).to be == {'visited' => 'true'} end end context "redirects" do context "300" do app do get '/' do %{redirect} end get '/redirect' do redirect to('/link'), 300 end get '/link' do %{got here} end end it "should follow HTTP 300 redirects" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/redirect"), URI("http://#{host}/link"), ] end end context "301" do app do get '/' do %{redirect} end get '/redirect' do redirect to('/link'), 301 end get '/link' do %{got here} end end it "should follow HTTP 301 redirects" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/redirect"), URI("http://#{host}/link"), ] end end context "302" do app do get '/' do %{redirect} end get '/redirect' do redirect to('/link'), 302 end get '/link' do %{got here} end end it "should follow HTTP 302 redirects" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/redirect"), URI("http://#{host}/link"), ] end end context "303" do app do get '/' do %{redirect} end get '/redirect' do redirect to('/link'), 303 end get '/link' do %{got here} end end it "should follow HTTP 303 redirects" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/redirect"), URI("http://#{host}/link"), ] end end context "307" do app do get '/' do %{redirect} end get '/redirect' do redirect to('/link'), 307 end get '/link' do %{got here} end end it "should follow HTTP 307 redirects" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/redirect"), URI("http://#{host}/link"), ] end end context "meta-refresh" do app do get '/' do %{redirect} end get '/redirect' do <<~HTML Redirecting... HTML end get '/link' do %{got here} end end it "should follow meta-refresh redirects" do expect(subject.history).to be == Set[ URI("http://#{host}/"), URI("http://#{host}/redirect"), URI("http://#{host}/link"), ] end end end context "Basic-Auth" do app do set :user, 'admin' set :password, 'swordfish' get '/' do %{private link} end get '/private' do auth = Rack::Auth::Basic::Request.new(request.env) if auth.provided? && auth.basic? && auth.credentials && \ auth.credentials == [settings.user, settings.password] %{got here} else headers['WWW-Authenticate'] = %{Basic realm="Restricted Area"} halt 401, "