require 'spec_helper'

describe Arachni::Spider do

    before( :all ) do
        @opts = Arachni::Options.instance
        @opts.url = web_server_url_for :spider
        @url = @opts.url.to_s
    end

    before( :each ) do
        reset_options
        @opts.url = @url
        Arachni::HTTP.instance.reset
    end

    it 'supports HTTPS' do
        @opts.url = (web_server_url_for :spider_https).gsub( 'http', 'https' )
        spider = Arachni::Spider.new

        spider.run.size.should == 3
        spider.redirects.size.should == 2
    end

    it 'avoids infinite loops' do
        @opts.url = @url + 'loop'
        sitemap = Arachni::Spider.new.run

        expected = [ @opts.url, @opts.url + '_back' ]
        (sitemap & expected).sort.should == expected.sort
    end

    it 'preserves cookies' do
        @opts.url = @url + 'with_cookies'
        Arachni::Spider.new.run.
            include?( @url + 'with_cookies3' ).should be_true
    end

    it 'ignores redirections to foreign domains' do
        @opts.url = @url + 'foreign_domain'
        Arachni::Spider.new.run.should == [ @opts.url ]
    end

    context 'when unable to get a response for the given URL' do
        context 'due to a network error' do
            it 'returns an empty sitemap and has failures' do
                @opts.url = 'http://blahaha'
                s = Arachni::Spider.new( @opts )

                s.url.should == @opts.url
                s.run.should be_empty
                s.failures.should be_any
            end
        end

        context 'due to a server error' do
            it 'returns an empty sitemap and has failures' do
                @opts.url = @url + '/fail'
                s = Arachni::Spider.new( @opts )

                s.url.should == @opts.url
                s.run.should be_empty
                s.failures.should be_any
            end
        end

        it "retries #{Arachni::Spider::MAX_TRIES} times" do
            @opts.url = @url + '/fail_4_times'
            s = Arachni::Spider.new( @opts )

            s.url.should == @opts.url
            s.run.should be_any
        end
    end

    describe '#failures' do
        context 'when there are no failed requests' do
            it 'returns an empty array' do
                s = Arachni::Spider.new( @opts )
                s.run.should be_any
                s.failures.should be_empty
            end
        end

        context 'when there are failed requests' do
            it 'returns an array containing the failed URLs' do
                @opts.url = 'http://blahaha/'
                s = Arachni::Spider.new( @opts )

                s.url.should == @opts.url
                s.run.should be_empty
                s.failures.should be_any
                s.failures.should include( @opts.url )
            end
        end
    end

    describe '.new' do
        it 'initializes it using the passed options' do
            Arachni::Spider.new( @opts ).url.should == @url
        end

        context 'when called without params' do
            it 'defaults to Arachni::Options.instance' do
                Arachni::Spider.new.url.should == @url
            end
        end

        context 'when Options.extend_paths has been set' do
            it 'adds those paths to be followed' do
                @opts.extend_paths = %w(some_path)

                s = Arachni::Spider.new
                s.paths.sort.should == ([@url] | [@url + @opts.extend_paths.first]).sort
            end
        end
    end

    describe '#opts' do
        it 'returns the init options' do
            Arachni::Spider.new.opts.should == @opts
        end
    end

    describe '#redirects' do
        it 'holds an array of requested URLs that caused a redirect' do
            @opts.url = @url + 'redirect'
            s = Arachni::Spider.new
            s.run
            s.redirects.should == [ s.url ]
        end
    end

    describe '#url' do
        it 'returns the seed URL' do
            Arachni::Spider.new.url.should == @url
        end
    end

    describe '#sitemap' do
        context 'when just initialized' do
            it 'is empty' do
                Arachni::Spider.new.sitemap.should be_empty
            end
        end

        context 'after a crawl' do
            it 'returns a list of crawled URLs' do
                s = Arachni::Spider.new
                s.run
                s.sitemap.include?( @url ).should be_true
            end
        end
    end

    describe '#fancy_sitemap' do
        context 'when just initialized' do
            it 'is empty' do
                spider = Arachni::Spider.new
                spider.fancy_sitemap.should be_empty
            end
        end

        context 'after a crawl' do
            it 'returns a hash of crawled URLs with their HTTP response codes' do
                spider = Arachni::Spider.new
                spider.run

                spider.fancy_sitemap.include?( @url ).should be_true
                spider.fancy_sitemap[@url].should == 200
                spider.fancy_sitemap[@url + 'this_does_not_exist'].should == 404
            end
        end
    end

    describe '#run' do
        it 'performs the crawl' do
            @opts.url = @url + '/lots_of_paths'
            spider = Arachni::Spider.new
            spider.run.size.should == 10051
        end

        it 'ignores path parameters' do
            @opts.url = @url + '/path_params'
            spider = Arachni::Spider.new
            spider.run.select { |url| url.include?( '/something' ) }.size.should == 1
        end

        context 'Options.exclude_pages' do
            it 'skips pages which match the configured patterns (but not the seed URL)' do
                @opts.exclude_pages = /skip-me/i
                @opts.url = "#{@url}skip"

                Arachni::Spider.new.run.should == [@opts.url, "#{@url}follow-me"]
            end
        end

        context 'Options.exclude' do
            it 'skips paths which match the configured patterns (but not the seed URL)' do
                @opts.exclude = /skip-me/i
                @opts.url = "#{@url}skip"

                Arachni::Spider.new.run.should == [@opts.url, "#{@url}follow-me"]
            end
        end

        context 'Options.include' do
            it 'skips paths which do not match the configured patterns (but not the seed URL)' do
                @opts.include = /include-me/i
                @opts.url = "#{@url}include"

                Arachni::Spider.new.run.sort.should ==
                    [@opts.url, "#{@url}include-me/1", "#{@url}include-me/2"].sort
            end
        end

        context 'Options.do_not_crawl' do
            it 'does not crawl the site' do
                @opts.do_not_crawl
                Arachni::Spider.new.run.should be_nil
            end

            context 'when crawling is then enabled using Options.crawl' do
                it 'performs a crawl' do
                    @opts.crawl
                    Arachni::Spider.new.run.should be_any
                end
            end
        end

        context 'Options.auto_redundant' do
            describe 5 do
                it 'only crawls 5 URLs with identical query parameter names' do
                    @opts.auto_redundant = 5
                    @opts.url += 'auto-redundant'
                    Arachni::Spider.new.run.size.should == 11
                end
            end
        end

        context 'when the link-count-limit option has been set' do
            context 'and the limit has been reached' do
                it 'immediately returns' do
                    @opts.link_count_limit = 1
                    spider = Arachni::Spider.new

                    spider.run.should == spider.sitemap
                    spider.sitemap.should == [@url]
                    spider.run.should be_false
                end
            end

            it 'follows only the specified amount of paths' do
                @opts.link_count_limit = 1
                spider = Arachni::Spider.new
                spider.run.should == spider.sitemap
                spider.sitemap.should == [@url]

                @opts.link_count_limit = 2
                spider = Arachni::Spider.new
                spider.run.should == spider.sitemap
                spider.sitemap.size.should == 2
            end
        end

        context 'when redundant rules have been set' do
            it 'follows the matching paths the specified number of times' do
                @opts.url = @url + '/redundant'

                @opts.redundant = { 'redundant' => 2 }
                spider = Arachni::Spider.new
                spider.run.select { |url| url.include?( 'redundant' ) }.size.should == 2

                @opts.redundant = { 'redundant' => 3 }
                spider = Arachni::Spider.new
                spider.run.select { |url| url.include?( 'redundant' ) }.size.should == 3
            end
        end

        context 'when called without parameters' do
            it 'performs a crawl and returns the sitemap' do
                spider = Arachni::Spider.new
                spider.run.should == spider.sitemap
                spider.sitemap.should be_any
            end
        end

        context 'when called with a block only' do
            it 'passes the block each page as visited' do
                spider = Arachni::Spider.new

                pages = []
                spider.run { |page| pages << page }

                pages.size.should == spider.sitemap.size
                pages.first.is_a?( Arachni::Page ).should be_true
            end
        end

        context 'when a redirect that is outside the scope is encountered' do
            it 'is ignored' do
                @opts.url = @url + '/skip_redirect'

                spider = Arachni::Spider.new
                spider.run.should be_empty
                spider.redirects.size.should == 1
            end
        end

        it 'follows relative redirect locations' do
            @opts.url = @url + '/relative_redirect'
            @opts.redirect_limit = -1

            spider = Arachni::Spider.new
            spider.run.select { |url| url.include?( 'stacked_redirect4' ) }.should be_any
        end

        it 'follows stacked redirects' do
            @opts.url = @url + '/stacked_redirect'
            @opts.redirect_limit = -1

            spider = Arachni::Spider.new
            spider.run.select { |url| url.include?( 'stacked_redirect4' ) }.should be_any
        end

        it 'ignores stacked redirects that exceed the limit' do
            @opts.url = @url + '/stacked_redirect'
            @opts.redirect_limit = 3

            spider = Arachni::Spider.new
            spider.run.size.should == 3
        end

        context 'when called with options and a block' do
            describe :pass_pages_to_block do
                describe true do
                    it 'passes the block each page as visited' do
                        spider = Arachni::Spider.new

                        pages = []
                        spider.run( true ) { |page| pages << page }

                        pages.size.should == spider.sitemap.size
                        pages.first.is_a?( Arachni::Page ).should be_true
                    end
                end

                describe false do
                    it 'passes the block each HTTP response as received' do
                        spider = Arachni::Spider.new

                        responses = []
                        spider.run( false ) { |res| responses << res }

                        responses.size.should == spider.sitemap.size
                        responses.first.is_a?( Typhoeus::Response ).should be_true
                    end
                end
            end
        end
    end

    describe '#on_each_page' do
        it 'is passed each page as visited' do
            pages  = []
            pages2 = []

            s = Arachni::Spider.new

            s.on_each_page { |page| pages << page }.should == s
            s.on_each_page { |page| pages2 << page }.should == s

            s.run

            pages.should == pages2
            pages.size.should == s.sitemap.size
            pages.first.is_a?( Arachni::Page ).should be_true
        end
    end

    describe '#on_each_response' do
        it 'is passed each response as received' do
            responses  = []
            responses2 = []

            s = Arachni::Spider.new

            s.on_each_response { |response| responses << response }.should == s
            s.on_each_response { |response| responses2 << response }.should == s

            s.run

            responses.should == responses2
            responses.size.should == s.sitemap.size
            responses.first.is_a?( Typhoeus::Response ).should be_true
        end
    end

    describe '#on_complete' do
        it 'is called once the crawl is done' do
            s = Arachni::Spider.new

            called  = false
            called2 = false

            s.on_complete { called = true }.should == s
            s.on_complete { called2 = true }.should == s

            s.run

            called.should == called2
            called.should be_true
        end
    end

    describe '#push' do
        it 'pushes paths for the crawler to follow' do
            s = Arachni::Spider.new

            path = @url + 'a_pushed_path'
            s.push( path )
            s.paths.include?( path ).should be_true

            s.run
            s.paths.include?( path ).should be_false
            s.sitemap.include?( path ).should be_true

            s = Arachni::Spider.new

            paths = [@url + 'a_pushed_path', @url + 'another_pushed_path']
            s.push( paths ).should be_true
            (s.paths & paths).sort.should == paths.sort

            s.run
            (s.paths & paths).should be_empty
            (s.sitemap & paths).sort.should == paths.sort
        end

        it 'normalizes and follows the pushed paths' do
            s = Arachni::Spider.new

            p  = 'some-path blah! %&$'
            wp = 'another weird path %"&*[$)'

            nwp = Arachni::Module::Utilities.to_absolute( wp )
            np  = Arachni::Module::Utilities.to_absolute( p )

            s.push( p ).should be_true
            s.run

            s.fancy_sitemap[np].should  == 200
            s.fancy_sitemap[nwp].should == 200
        end

        context 'when the link-count-limit option has been set' do
            context 'and the limit has been reached' do
                it 'immediately returns' do
                    @opts.link_count_limit = 1
                    spider = Arachni::Spider.new

                    spider.run.should == spider.sitemap
                    spider.sitemap.should == [@url]
                    spider.push( Arachni::Module::Utilities.to_absolute( 'test' ) ).should be_false
                end
            end
        end
    end

    describe '#done?' do
        context 'when not running' do
            it 'returns false' do
                s = Arachni::Spider.new
                s.done?.should be_false
            end
        end

        context 'when running' do
            it 'returns false' do
                s = Arachni::Spider.new
                Thread.new{ s.run }
                s.done?.should be_false
            end
        end

        context 'when it has finished' do
            it 'returns true' do
                s = Arachni::Spider.new
                s.run
                s.done?.should be_true
            end
        end
    end

    describe '#running?' do
        context 'when not running' do
            it 'returns false' do
                s = Arachni::Spider.new
                s.running?.should be_false
            end
        end

        context 'when running' do
            it 'returns true' do
                @opts.url = web_server_url_for( :auditor ) + '/sleep'
                s = Arachni::Spider.new

                Thread.new{ s.run }
                sleep 1
                s.running?.should be_true
            end
        end

        context 'when it has finished' do
            it 'returns false' do
                s = Arachni::Spider.new
                s.run
                s.running?.should be_false
            end
        end
    end

    describe '#pause' do
        it 'pauses a running crawl' do
            s = Arachni::Spider.new
            Thread.new{ s.run }

            s.pause
            sleep 1
            s.sitemap.should be_empty
        end
    end

    describe '#paused?' do
        context 'when the crawl is not paused' do
            it 'returns false' do
                s = Arachni::Spider.new
                s.paused?.should be_false
            end
        end

        context 'when the crawl is paused' do
            it 'returns true' do
                s = Arachni::Spider.new
                s.pause
                s.paused?.should be_true
            end
        end
    end

    describe '#resume' do
        it 'resumes a paused crawl' do
            @opts.url = @url + 'sleep'

            s = Arachni::Spider.new
            s.pause

            Thread.new{ s.run }

            sleep 1
            s.sitemap.should be_empty
            s.done?.should be_false

            s.resume
            sleep 0.1 while !s.done?

            s.sitemap.should be_any
            s.done?.should be_true
        end
    end
end