spec/agent_spec.rb in spidr-0.6.1 vs spec/agent_spec.rb in spidr-0.7.0
- old
+ new
@@ -5,10 +5,254 @@
require 'spidr/agent'
describe Agent do
it_should_behave_like "includes Spidr::Settings::UserAgent"
+ describe ".start_at" do
+ module TestAgentStartAt # Sinatra fixture apps; the before hook below routes the stubbed hosts to them
+ class ExampleApp < Sinatra::Base
+
+ set :host, 'example.com'
+ set :port, 80
+
+ get '/' do
+ '<html><body>should not get here</body></html>' # the crawl starts at /entry-point and no fixture page links to the root
+ end
+
+ get '/entry-point' do
+ <<~HTML
+ <html>
+ <body>
+ <a href="/link1">link1</a>
+ <a href="http://other.com/offsite-link">offsite link</a>
+ <a href="/link2">link2</a>
+ </body>
+ </html>
+ HTML
+ end
+
+ get '/link1' do
+ '<html><body>got here</body></html>'
+ end
+
+ get '/link2' do
+ '<html><body>got here</body></html>'
+ end
+ end
+
+ class OtherApp < Sinatra::Base
+
+ set :host, 'other.com'
+ set :port, 80
+
+ get '/offsite-link' do # NOTE(review): the body says "should not get here", yet the example below expects this URL in the history - confirm which is intended
+ '<html><body>should not get here</body></html>'
+ end
+
+ end
+ end
+
+ subject { described_class }
+
+ let(:host) { 'example.com' }
+ let(:other_host) { 'other.com' }
+ let(:url) { URI("http://#{host}/entry-point") }
+
+ let(:app) { TestAgentStartAt::ExampleApp }
+ let(:other_app) { TestAgentStartAt::OtherApp }
+
+ before do
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app) # any request whose URL matches the host name is answered by the Rack app
+ stub_request(:any, /#{Regexp.escape(other_host)}/).to_rack(other_app)
+ end
+
+ it "must spider the website starting at the given URL" do
+ agent = subject.start_at(url)
+
+ expect(agent.history).to be == Set[
+ URI("http://#{host}/entry-point"),
+ URI("http://#{host}/link1"),
+ URI("http://#{other_host}/offsite-link"), # the off-site link is expected to be visited as well
+ URI("http://#{host}/link2")
+ ]
+ end
+ end
+
+ describe ".site" do
+ module TestAgentSite # Sinatra fixture app; the before hook below routes the stubbed host to it
+ class ExampleApp < Sinatra::Base
+
+ set :host, 'example.com'
+ set :port, 80
+
+ get '/' do
+ '<html><body>should not get here</body></html>' # the crawl starts at /entry-point and no fixture page links to the root
+ end
+
+ get '/entry-point' do
+ <<~HTML
+ <html>
+ <body>
+ <a href="/link1">link1</a>
+ <a href="http://other.com/offsite-link">offsite link</a>
+ <a href="/link2">link2</a>
+ </body>
+ </html>
+ HTML
+ end
+
+ get '/link1' do
+ '<html><body>got here</body></html>'
+ end
+
+ get '/link2' do
+ '<html><body>got here</body></html>'
+ end
+
+ end
+ end
+
+ subject { described_class }
+
+ let(:host) { 'example.com' }
+ let(:url) { URI("http://#{host}/entry-point") }
+
+ let(:app) { TestAgentSite::ExampleApp }
+
+ before do
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app) # only example.com is stubbed in this group; other.com has no stub
+ end
+
+ it "must spider the website starting at the given URL" do
+ agent = subject.site(url)
+
+ expect(agent.history).to be == Set[ # the other.com link from /entry-point must not appear here
+ URI("http://#{host}/entry-point"),
+ URI("http://#{host}/link1"),
+ URI("http://#{host}/link2")
+ ]
+ end
+ end
+
+ describe ".host" do
+ module TestAgentHost
+ class ExampleApp < Sinatra::Base
+
+ set :host, 'example.com'
+ set :port, 80
+
+ get '/' do
+ <<~HTML
+ <html>
+ <body>
+ <a href="/link1">link1</a>
+ <a href="http://other.com/offsite-link">offsite link</a>
+ <a href="/link2">link2</a>
+ </body>
+ </html>
+ HTML
+ end
+
+ get '/link1' do
+ '<html><body>got here</body></html>'
+ end
+
+ get '/link2' do
+ '<html><body>got here</body></html>'
+ end
+
+ end
+ end
+
+ subject { described_class }
+
+ let(:host) { 'example.com' }
+ let(:app) { TestAgentHost::ExampleApp }
+
+ before do
+ stub_request(:any, /#{Regexp.escape(host)}/).to_rack(app)
+ end
+
+ it "must spider the website starting at the given URL" do
+ agent = subject.host(host)
+
+ # XXX: for some reason Set#== was returning false, so convert to an Array
+ expect(agent.history.to_a).to be == [
+ URI("http://#{host}/"),
+ URI("http://#{host}/link1"),
+ URI("http://#{host}/link2")
+ ]
+ end
+ end
+
+ describe ".domain" do
+ module TestAgentDomain
+ class ExampleApp < Sinatra::Base
+
+ set :host, 'example.com'
+ set :port, 80
+
+ get '/' do
+ <<~HTML
+ <html>
+ <body>
+ <a href="/link1">link1</a>
+ <a href="http://sub.example.com/subdomain-link">subdomain link</a>
+ <a href="/link2">link2</a>
+ </body>
+ </html>
+ HTML
+ end
+
+ get '/link1' do
+ '<html><body>got here</body></html>'
+ end
+
+ get '/link2' do
+ '<html><body>got here</body></html>'
+ end
+
+ end
+
+ class SubDomainApp < Sinatra::Base
+
+ set :host, 'sub.example.com'
+ set :port, 80
+
+ get '/subdomain-link' do
+ '<html><body>should get here</body></html>'
+ end
+
+ end
+ end
+
+ subject { described_class }
+
+ let(:domain) { 'example.com' }
+ let(:domain_app) { TestAgentDomain::ExampleApp }
+
+ let(:subdomain) { 'sub.example.com' }
+ let(:subdomain_app) { TestAgentDomain::SubDomainApp }
+
+ before do
+ stub_request(:any, /#{Regexp.escape(subdomain)}/).to_rack(subdomain_app)
+ stub_request(:any, /#{Regexp.escape(domain)}/).to_rack(domain_app)
+ end
+
+ it "must spider the domain and subdomains starting at the given domain" do
+ agent = subject.domain(domain)
+
+ # XXX: for some reason Set#== was returning false, so convert to an Array
+ expect(agent.history.to_a).to be == [
+ URI("http://#{domain}/"),
+ URI("http://#{domain}/link1"),
+ URI("http://#{subdomain}/subdomain-link"),
+ URI("http://#{domain}/link2")
+ ]
+ end
+ end
+
describe "#initialize" do
it "should not be running" do
expect(subject).to_not be_running
end
@@ -30,10 +274,72 @@
it "should initialize the #session_cache" do
expect(subject.sessions).to be_kind_of(SessionCache)
end
+ context "when the proxy: keyword argument is given" do
+ let(:proxy) do
+ Spidr::Proxy.new(host: 'example.com')
+ end
+
+ subject { described_class.new(proxy: proxy) }
+
+ it "must initialize the #proxy of #session_cache" do # NOTE(review): the description says #session_cache, but the value is read through #sessions
+ expect(subject.sessions.proxy).to be(proxy) # identity check: the very same Proxy object must be handed over
+ end
+ end
+
+ context "when the open_timeout: keyword argument is given" do # this and the following timeout contexts each pass one keyword to .new and read it back from #sessions
+ let(:open_timeout) { 5 }
+
+ subject { described_class.new(open_timeout: open_timeout) }
+
+ it "must initialize the #open_timeout of #session_cache" do
+ expect(subject.sessions.open_timeout).to eq(open_timeout)
+ end
+ end
+
+ context "when the ssl_timeout: keyword argument is given" do
+ let(:ssl_timeout) { 5 }
+
+ subject { described_class.new(ssl_timeout: ssl_timeout) }
+
+ it "must initialize the #ssl_timeout of #session_cache" do
+ expect(subject.sessions.ssl_timeout).to eq(ssl_timeout)
+ end
+ end
+
+ context "when the read_timeout: keyword argument is given" do
+ let(:read_timeout) { 5 }
+
+ subject { described_class.new(read_timeout: read_timeout) }
+
+ it "must initialize the #read_timeout of #session_cache" do
+ expect(subject.sessions.read_timeout).to eq(read_timeout)
+ end
+ end
+
+ context "when the continue_timeout: keyword argument is given" do
+ let(:continue_timeout) { 5 }
+
+ subject { described_class.new(continue_timeout: continue_timeout) }
+
+ it "must initialize the #continue_timeout of #session_cache" do
+ expect(subject.sessions.continue_timeout).to eq(continue_timeout)
+ end
+ end
+
+ context "when the keep_alive_timeout: keyword argument is given" do
+ let(:keep_alive_timeout) { 5 }
+
+ subject { described_class.new(keep_alive_timeout: keep_alive_timeout) }
+
+ it "must initialize the #keep_alive_timeout of #session_cache" do
+ expect(subject.sessions.keep_alive_timeout).to eq(keep_alive_timeout)
+ end
+ end
+
it "should initialize the #cookie_jar" do
expect(subject.cookies).to be_kind_of(CookieJar)
end
it "should initialize the #auth_store" do
@@ -384,11 +690,19 @@
end
context "frames" do
app do
get '/' do
- %{<html><body><frameset><frame src="/frame" /></frameset></body></html>}
+ <<~HTML
+ <html>
+ <body>
+ <frameset>
+ <frame src="/frame" />
+ </frameset>
+ </body>
+ </html>
+ HTML
end
get '/frame' do
%{<html><body><a href="/link">link</a></body></html>}
end
@@ -612,11 +926,18 @@
get '/' do
%{<html><body><a href="/redirect">redirect</a></body></html>}
end
get '/redirect' do
- %{<html><head><meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" /></head><body>Redirecting...</body></html>}
+ <<~HTML
+ <html>
+ <head>
+ <meta http-equiv="refresh" content="0; url=http://#{settings.host}/link" />
+ </head>
+ <body>Redirecting...</body>
+ </html>
+ HTML
end
get '/link' do
%{<html><body>got here</body></html>}
end
@@ -672,11 +993,18 @@
subject { described_class.new(host: host) }
app do
get '/' do
- %{<html><body><a href="http://google.com/">external link</a> <a href="/link">local link</a></body></html>}
+ <<~HTML
+ <html>
+ <body>
+ <a href="http://google.com/">external link</a>
+ <a href="/link">local link</a>
+ </body>
+ </html>
+ HTML
end
get '/link' do
%{<html><body>got here</body></html>}
end
@@ -724,17 +1052,31 @@
context "when :depth is set" do
include_context "example App"
app do
get '/' do
- %{<html><body><a href="/left?d=1">left</a><a href="/right?d=1">right</a></body></html>}
+ <<~HTML
+ <html>
+ <body>
+ <a href="/left?d=1">left</a>
+ <a href="/right?d=1">right</a>
+ </body>
+ </html>
+ HTML
end
- get %r{^/left|/right} do
+ get %r{/left|/right} do
d = Integer(params['d'])
- %{<html><body><a href="/left?d=#{d+1}">left</a><a href="/right?d=#{d+1}">right</a></body></html>}
+ <<~HTML
+ <html>
+ <body>
+ <a href="/left?d=#{d+1}">left</a>
+ <a href="/right?d=#{d+1}">right</a>
+ </body>
+ </html>
+ HTML
end
end
context "depth 0" do
subject { described_class.new(host: host, max_depth: 0) }
@@ -772,10 +1114,17 @@
)
end
app do
get '/' do
- %{<html><body><a href="/secret">don't follow this link</a> <a href="/pub">follow this link</a></body></html>}
+ <<~HTML
+ <html>
+ <body>
+ <a href="/secret">don't follow this link</a>
+ <a href="/pub">follow this link</a>
+ </body>
+ </html>
+ HTML
end
get '/pub' do
%{<html><body>got here</body></html>}
end