require 'spec_helper' require 'ronin/web/spider/agent' require 'webmock/rspec' require 'sinatra/base' describe Ronin::Web::Spider::Agent do describe "#initialize" do context "when Ronin::Support::Network::HTTP.proxy is set" do let(:proxy_host) { 'example.com' } let(:proxy_port) { 8080 } let(:proxy_uri) { URI::HTTP.build(host: proxy_host, port: proxy_port) } before { Ronin::Support::Network::HTTP.proxy = proxy_uri } it "must parse ENV['RONIN_HTTP_USER_AGENT'] and set #proxy" do expect(subject.proxy).to be_kind_of(Spidr::Proxy) expect(subject.proxy.host).to eq(proxy_host) expect(subject.proxy.port).to eq(proxy_port) end after { Ronin::Support::Network::HTTP.proxy = nil } end context "when Ronin::Support::Network::HTTP.user_agent is set" do let(:user_agent) { 'Foo Bar' } before { Ronin::Support::Network::HTTP.user_agent = user_agent } it "must default #user_agent to ENV['RONIN_HTTP_USER_AGENT']" do expect(subject.user_agent).to eq(user_agent) end after { Ronin::Support::Network::HTTP.user_agent = nil } end context "when given the proxy: keyword argument" do let(:proxy_host) { 'example.com' } let(:proxy_port) { 8080 } context "and it's an Addressable::URI" do let(:proxy) { Addressable::URI.new(host: proxy_host, port: proxy_port) } subject { described_class.new(proxy: proxy) } it "must convert it to a Spidr::Proxy object" do expect(subject.proxy).to be_kind_of(Spidr::Proxy) expect(subject.proxy.host).to eq(proxy_host) expect(subject.proxy.port).to eq(proxy_port) end end context "and it's an URI::HTTP" do let(:proxy) { URI::HTTP.build(host: proxy_host, port: proxy_port) } subject { described_class.new(proxy: proxy) } it "must convert it to a Spidr::Proxy object" do expect(subject.proxy).to be_kind_of(Spidr::Proxy) expect(subject.proxy.host).to eq(proxy_host) expect(subject.proxy.port).to eq(proxy_port) end end context "and it's a Hash" do let(:proxy) do {host: proxy_host, port: proxy_port} end subject { described_class.new(proxy: proxy) } it "must convert it to a Spidr::Proxy object" do expect(subject.proxy).to be_kind_of(Spidr::Proxy) expect(subject.proxy.host).to eq(proxy_host) expect(subject.proxy.port).to eq(proxy_port) end end context "and it's a String" do let(:proxy) { "http://#{proxy_host}:#{proxy_port}" } subject { described_class.new(proxy: proxy) } it "must convert it to a Spidr::Proxy object" do expect(subject.proxy).to be_kind_of(Spidr::Proxy) expect(subject.proxy.host).to eq(proxy_host) expect(subject.proxy.port).to eq(proxy_port) end end end context "when given the user_agent: keyword argument" do context "and it's a String" do let(:user_agent) { "test user-agent" } subject { described_class.new(user_agent: user_agent) } it "must set the #user_agent" do expect(subject.user_agent).to eq(user_agent) end end context "and it's a Symbol" do let(:user_agent) { :chrome_linux } let(:expected_user_agent) do Ronin::Support::Network::HTTP::UserAgents[user_agent] end subject { described_class.new(user_agent: user_agent) } it "must map the Symbol to one of Ronin::Support::Network::HTTP::UserAgents" do expect(subject.user_agent).to eq(expected_user_agent) end end end it "must default #visited_hosts to nil" do expect(subject.visited_hosts).to be(nil) end end describe "#every_host" do module TestAgentEveryHost class Host1 < Sinatra::Base set :host, 'host1.example.com' set :port, 80 get '/' do <<~HTML
link1 offsite link link2 HTML end get '/link1' do 'got here' end get '/link2' do 'got here' end end class Host2 < Sinatra::Base set :host, 'host2.example.com' set :port, 80 get '/offsite-link' do 'should not get here' end end end let(:host1) { 'host1.example.com' } let(:host2) { 'host2.example.com' } let(:host1_app) { TestAgentEveryHost::Host1 } let(:host2_app) { TestAgentEveryHost::Host2 } before do stub_request(:any, /#{Regexp.escape(host1)}/).to_rack(host1_app) stub_request(:any, /#{Regexp.escape(host2)}/).to_rack(host2_app) end it "must yield every newly discovered hostname while spidering" do yielded_hosts = [] subject.every_host do |host| yielded_hosts << host end subject.start_at("http://#{host1}/") expect(yielded_hosts).to eq([host1, host2]) end it "must popualte #visited_hosts" do subject.every_host { |host| } subject.start_at("http://#{host1}/") expect(subject.visited_hosts).to be_kind_of(Set) expect(subject.visited_hosts.entries).to eq([host1, host2]) end end # TODO: need to figure out how to test #every_cert using webmock. describe "#every_cert" describe "#every_favicon" do module TestAgentEveryHost class TestApp < Sinatra::Base set :host, 'example.com' set :port, 80 get '/' do <<~HTML link1 offsite link link2 HTML end get '/favicon1.ico' do content_type 'image/x-icon' "favicon1" end get '/favicon2.ico' do content_type 'image/vnd.microsoft.icon' "favicon2" end get '/link1' do 'got here' end get '/link2' do <<~HTML got here HTML end end end let(:host) { 'example.com' } let(:test_app) { TestAgentEveryHost::TestApp } before do stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app) end it "must yield Spidr::Page objects for each encountered .ico file" do yielded_favicons = [] subject.every_favicon do |favicon| yielded_favicons << favicon end subject.start_at("http://#{host}/") expect(yielded_favicons).to_not be_empty expect(yielded_favicons[0]).to be_kind_of(Spidr::Page) expect(yielded_favicons[0].content_type).to eq('image/x-icon') expect(yielded_favicons[0].url).to eq(URI("http://#{host}/favicon1.ico")) expect(yielded_favicons[1]).to be_kind_of(Spidr::Page) expect(yielded_favicons[1].content_type).to eq('image/vnd.microsoft.icon') expect(yielded_favicons[1].url).to eq(URI("http://#{host}/favicon2.ico")) end end describe "#every_html_comment" do module TestAgentEveryHTMLComment class TestApp < Sinatra::Base set :host, 'example.com' set :port, 80 get '/' do <<~HTML HTML end end end let(:host) { 'example.com' } let(:test_app) { TestAgentEveryHTMLComment::TestApp } before do stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app) end it "must yield every non-empty/non-whitespace HTML comment String" do yielded_comments = [] subject.every_html_comment do |comment| yielded_comments << comment end subject.start_at("http://#{host}/") expect(yielded_comments).to match_array( [ 'comment 1', 'comment 2' ] ) end end describe "#every_javascript" do module TestAgentEveryJavaScript class TestApp < Sinatra::Base set :host, 'example.com' set :port, 80 get '/' do <<~HTML link1 offsite link link2 HTML end get '/javascript1.js' do content_type 'text/javascript' "javascript1" end end end let(:host) { 'example.com' } let(:test_app) { TestAgentEveryJavaScript::TestApp } before do stub_request(:any, /#{Regexp.escape(host)}/).to_rack(test_app) end it "must yield both the contents of .js files and inline link1 offsite link link2