lib/spider.rb in arachni-0.2.4 vs lib/spider.rb in arachni-0.3

- removed (present in arachni-0.2.4)
+ added (present in arachni-0.3)

@@ -6,12 +6,13 @@
   this program under the term of the GPL v2.0 License
   (See LICENSE file for details)
 
 =end
 
-require Arachni::Options.instance.dir['lib'] + 'anemone'
 require Arachni::Options.instance.dir['lib'] + 'module/utilities'
+require 'nokogiri'
+require Arachni::Options.instance.dir['lib'] + 'nokogiri/xml/node'
 
 module Arachni
 
 #
 # Spider class
@@ -19,11 +20,11 @@
 # Crawls the URL in opts[:url] and grabs the HTML code and headers.
 #
 # @author: Tasos "Zapotek" Laskos
 #                                      <tasos.laskos@gmail.com>
 #                                      <zapotek@segfault.gr>
-# @version: 0.1
+# @version: 0.2
 #
 class Spider
 
     include Arachni::UI::Output
     include Arachni::Module::Utilities
@@ -32,12 +33,10 @@
     #
     # @return [Options]
     #
     attr_reader :opts
 
-    attr_reader :pages
-
     #
     # Sitemap, array of links
     #
     # @return [Array]
     #
@@ -57,35 +56,10 @@
     # @param [Options] opts
     #
     def initialize( opts )
         @opts = opts
 
-        @anemone_opts = {
-            :threads => 1,
-            :discard_page_bodies => false,
-            :delay => 0,
-            :obey_robots_txt => false,
-            :depth_limit => false,
-            :link_count_limit => false,
-            :redirect_limit => false,
-            :storage => nil,
-            :cookies => nil,
-            :accept_cookies => true,
-            :proxy_addr => nil,
-            :proxy_port => nil,
-            :proxy_user => nil,
-            :proxy_pass => nil
-        }
-
-        hash_opts = @opts.to_h
-        @anemone_opts.each_pair {
-            |k, v|
-            @anemone_opts[k] = hash_opts[k.to_s] if hash_opts[k.to_s]
-        }
-
-        @anemone_opts = @anemone_opts.merge( hash_opts )
-
         @sitemap = []
         @on_every_page_blocks = []
 
         # if we have no 'include' patterns create one that will match
         # everything, like '.*'
@@ -100,88 +74,173 @@
     # @return [Arachni::Parser::Page]
     #
     def run( &block )
         return if @opts.link_count_limit == 0
 
-        i = 1
-        # start the crawl
-        Anemone.crawl( @opts.url, @anemone_opts ) {
-            |anemone|
+        paths = []
+        paths << @opts.url.to_s
 
-            # apply 'exclude' patterns
-            anemone.skip_links_like( @opts.exclude ) if @opts.exclude
+        visited = []
 
-            # apply 'include' patterns and grab matching pages
-            # as they are discovered
-            anemone.on_pages_like( @opts.include ) {
-                |page|
+        while( !paths.empty? )
+            while( !paths.empty? && url = paths.pop )
 
-                @pages = anemone.pages.keys || []
+                url = url_sanitize( url )
+                next if skip?( url ) || !in_domain?( url )
 
-                url = url_sanitize( page.url.to_s )
+                wait_if_paused
 
-                # something went kaboom, tell the user and skip the page
-                if page.error
-                    print_error( "[Error: " + (page.error.to_s) + "] " + url )
-                    print_debug_backtrace( page.error )
-                    next
-                end
+                visited << url
 
-                # push the url in the sitemap
-                @sitemap.push( url )
+                opts = {
+                    :timeout => nil,
+                    :remove_id => true,
+                    :async => @opts.spider_first
+                }
 
-                print_line
-                print_status( "[HTTP: #{page.code}] " + url )
+                Arachni::HTTP.instance.get( url, opts ).on_complete {
+                    |res|
 
-                # call the block...if we have one
-                if block
-                    exception_jail{
-                        new_page = Arachni::Parser.new( @opts,
-                            Typhoeus::Response.new(
-                                :effective_url => url,
-                                :body => page.body,
-                                :headers_hash => page.headers
-                            )
-                        ).run
-                        new_page.code = page.code
-                        new_page.method = 'GET'
-                        block.call( new_page.clone )
+                    print_line
+                    print_status( "[HTTP: #{res.code}] " + res.effective_url )
+
+                    page = Arachni::Parser.new( @opts, res ).run
+                    page.url = url_sanitize( res.effective_url )
+
+                    @sitemap |= page.paths.map { |path| url_sanitize( path ) }
+                    paths |= @sitemap - visited
+
+
+                    # call the block...if we have one
+                    if block
+                        exception_jail{
+                            block.call( page.clone )
+                        }
+                    end
+
+                    # run blocks specified later
+                    @on_every_page_blocks.each {
+                        |block|
+                        block.call( page )
                     }
-                end
 
-                # run blocks specified later
-                @on_every_page_blocks.each {
-                    |block|
-                    block.call( page )
                 }
 
-                # we don't need the HTML doc anymore
-                page.discard_doc!( )
+                Arachni::HTTP.instance.run if !@opts.spider_first
 
                 # make sure we obey the link count limit and
                 # return if we have exceeded it.
                 if( @opts.link_count_limit &&
-                    @opts.link_count_limit <= i )
+                    @opts.link_count_limit <= visited.size )
+                    Arachni::HTTP.instance.run if @opts.spider_first
                     return @sitemap.uniq
                 end
 
-                i+=1
-            }
-        }
+            end
+
+            if @opts.spider_first
+                Arachni::HTTP.instance.run
+            else
+                break
+            end
+
+        end
 
         return @sitemap.uniq
     end
 
+    def skip?( url )
+        @opts.exclude.each {
+            |regexp|
+            return true if regexp =~ url
+        }
+
+        @opts.redundant.each_with_index {
+            |redundant, i|
+
+            if( url =~ redundant['regexp'] )
+
+                if( @opts.redundant[i]['count'] == 0 )
+                    print_verbose( 'Discarding redundant page: \'' + url + '\'' )
+                    return true
+                end
+
+                print_info( 'Matched redundancy rule: ' +
+                    redundant['regexp'].to_s + ' for page \'' +
+                    url + '\'' )
+
+                print_info( 'Count-down: ' + @opts.redundant[i]['count'].to_s )
+
+                @opts.redundant[i]['count'] -= 1
+            end
+        }
+
+
+        skip_cnt = 0
+        @opts.include.each {
+            |regexp|
+            skip_cnt += 1 if !(regexp =~ url)
+        }
+
+        return false if skip_cnt > 1
+
+        return false
+    end
+
+    def wait_if_paused
+        while( paused? )
+            ::IO::select( nil, nil, nil, 1 )
+        end
+    end
+
+    def pause!
+        @pause = true
+    end
+
+    def resume!
+        @pause = false
+    end
+
+    def paused?
+        @pause ||= false
+        return @pause
+    end
+
     #
-    # Decodes URLs to reverse multiple encodes and removes NULL characters
+    # Checks if the uri is in the same domain
     #
-    def url_sanitize( url )
+    # @param [URI] url
+    #
+    # @return [String]
+    #
+    def in_domain?( uri )
 
-        while( url =~ /%/ )
-            url = ( URI.decode( url ).to_s.unpack( 'A*' )[0] )
+        uri_1 = URI( uri.to_s )
+        uri_2 = URI( @opts.url.to_s )
+
+        if( @opts.follow_subdomains )
+            return extract_domain( uri_1 ) == extract_domain( uri_2 )
         end
 
-        return url
+        uri_1.host == uri_2.host
+    end
+
+    #
+    # Extracts the domain from a URI object
+    #
+    # @param [URI] url
+    #
+    # @return [String]
+    #
+    def extract_domain( url )
+
+        if !url.host then return false end
+
+        splits = url.host.split( /\./ )
+
+        if splits.length == 1 then return true end
+
+        splits[-2] + "." + splits[-1]
     end
 
     #
    # Hook for further analysis of pages, statistics etc.
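
Note: for orientation, below is a minimal usage sketch of the reworked 0.3 spider as it appears in this diff. Only Spider.new( opts ), run( &block ), the pause!/resume! controls and the returned sitemap come from the code above; the target URL and the way Arachni::Options is seeded are assumptions for illustration, not taken from this file.

    # Minimal usage sketch (assumes the Arachni 0.3 framework -- Options,
    # HTTP, Parser -- is already loaded; the target URL is hypothetical).
    opts     = Arachni::Options.instance
    opts.url = 'http://test.example/'   # hypothetical target

    spider = Arachni::Spider.new( opts )

    # In 0.3, run() drives the crawl itself (no Anemone): it pops URLs off
    # an internal queue, drops excluded/redundant/out-of-domain links via
    # skip? and in_domain?, and yields each parsed page to the block.
    sitemap = spider.run {
        |page|
        puts page.url
    }

    # run() returns @sitemap.uniq, i.e. the unique crawled URLs.
    puts "Crawled #{sitemap.size} unique URLs"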