lib/spider.rb in arachni-0.2.4 vs lib/spider.rb in arachni-0.3
- old
+ new
@@ -6,12 +6,13 @@
this program under the term of the GPL v2.0 License
(See LICENSE file for details)
=end
-require Arachni::Options.instance.dir['lib'] + 'anemone'
require Arachni::Options.instance.dir['lib'] + 'module/utilities'
+require 'nokogiri'
+require Arachni::Options.instance.dir['lib'] + 'nokogiri/xml/node'
module Arachni
#
# Spider class
@@ -19,11 +20,11 @@
# Crawls the URL in opts[:url] and grabs the HTML code and headers.
#
# @author: Tasos "Zapotek" Laskos
# <tasos.laskos@gmail.com>
# <zapotek@segfault.gr>
-# @version: 0.1
+# @version: 0.2
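+#
+# Rough usage sketch (assumes an already configured Options instance):
+#
+#    spider = Arachni::Spider.new( Arachni::Options.instance )
+#    sitemap = spider.run { |page| puts page.url }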
#
class Spider
include Arachni::UI::Output
include Arachni::Module::Utilities
@@ -32,12 +33,10 @@
#
# @return [Options]
#
attr_reader :opts
- attr_reader :pages
-
#
# Sitemap, array of links
#
# @return [Array]
#
@@ -57,35 +56,10 @@
# @param [Options] opts
#
def initialize( opts )
@opts = opts
- @anemone_opts = {
- :threads => 1,
- :discard_page_bodies => false,
- :delay => 0,
- :obey_robots_txt => false,
- :depth_limit => false,
- :link_count_limit => false,
- :redirect_limit => false,
- :storage => nil,
- :cookies => nil,
- :accept_cookies => true,
- :proxy_addr => nil,
- :proxy_port => nil,
- :proxy_user => nil,
- :proxy_pass => nil
- }
-
- hash_opts = @opts.to_h
- @anemone_opts.each_pair {
- |k, v|
- @anemone_opts[k] = hash_opts[k.to_s] if hash_opts[k.to_s]
- }
-
- @anemone_opts = @anemone_opts.merge( hash_opts )
-
@sitemap = []
@on_every_page_blocks = []
# if we have no 'include' patterns create one that will match
# everything, like '.*'
@@ -100,88 +74,173 @@
# @return [Array]  the sitemap, a list of crawled URLs
#
def run( &block )
return if @opts.link_count_limit == 0
- i = 1
- # start the crawl
- Anemone.crawl( @opts.url, @anemone_opts ) {
- |anemone|
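+ # Anemone has been replaced by a simple work-queue crawl:
+ # URLs are popped off 'paths', fetched through Arachni::HTTP
+ # and parsed for new paths until the queue is exhausted.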
+ paths = []
+ paths << @opts.url.to_s
- # apply 'exclude' patterns
- anemone.skip_links_like( @opts.exclude ) if @opts.exclude
+ visited = []
- # apply 'include' patterns and grab matching pages
- # as they are discovered
- anemone.on_pages_like( @opts.include ) {
- |page|
+ while( !paths.empty? )
+ while( !paths.empty? && url = paths.pop )
+ url = url_sanitize( url )
+ next if skip?( url ) || !in_domain?( url )
- @pages = anemone.pages.keys || []
+ wait_if_paused
- url = url_sanitize( page.url.to_s )
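+ # mark the URL as visited before the request is (possibly
+ # asynchronously) performed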
+ visited << url
- # something went kaboom, tell the user and skip the page
- if page.error
- print_error( "[Error: " + (page.error.to_s) + "] " + url )
- print_debug_backtrace( page.error )
- next
- end
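+ # request options; with spider_first the request is queued
+ # asynchronously and performed later in a single batch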
+ opts = {
+ :timeout => nil,
+ :remove_id => true,
+ :async => @opts.spider_first
+ }
- # push the url in the sitemap
- @sitemap.push( url )
+ Arachni::HTTP.instance.get( url, opts ).on_complete {
+ |res|
- print_line
- print_status( "[HTTP: #{page.code}] " + url )
+ print_line
+ print_status( "[HTTP: #{res.code}] " + res.effective_url )
- # call the block...if we have one
- if block
- exception_jail{
- new_page = Arachni::Parser.new( @opts,
- Typhoeus::Response.new(
- :effective_url => url,
- :body => page.body,
- :headers_hash => page.headers
- )
- ).run
- new_page.code = page.code
- new_page.method = 'GET'
- block.call( new_page.clone )
+ page = Arachni::Parser.new( @opts, res ).run
+ page.url = url_sanitize( res.effective_url )
+
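+ # merge the newly discovered paths into the sitemap and
+ # queue any that haven't been visited yet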
+ @sitemap |= page.paths.map { |path| url_sanitize( path ) }
+ paths |= @sitemap - visited
+
+
+ # call the block...if we have one
+ if block
+ exception_jail{
+ block.call( page.clone )
+ }
+ end
+
+ # run blocks specified later
+ @on_every_page_blocks.each {
+ |block|
+ block.call( page )
}
- end
- # run blocks specified later
- @on_every_page_blocks.each {
- |block|
- block.call( page )
}
- # we don't need the HTML doc anymore
- page.discard_doc!( )
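+ # without spider_first perform the queued request (and run its
+ # on_complete handler) before moving on to the next URL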
+ Arachni::HTTP.instance.run if !@opts.spider_first
# make sure we obey the link count limit and
# return if we have exceeded it.
if( @opts.link_count_limit &&
- @opts.link_count_limit <= i )
+ @opts.link_count_limit <= visited.size )
+ Arachni::HTTP.instance.run if @opts.spider_first
return @sitemap.uniq
end
- i+=1
- }
- }
+ end
+
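+ # with spider_first the queued requests are performed here in one
+ # async batch; their callbacks feed paths into the next pass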
+ if @opts.spider_first
+ Arachni::HTTP.instance.run
+ else
+ break
+ end
+
+ end
+
return @sitemap.uniq
end
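+
+ #
+ # Decides whether a URL should be skipped, based on the
+ # exclude, redundant and include rules of the options.
+ #
+ # @param [String] url
+ #
+ # @return [Bool]
+ #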
+ def skip?( url )
+ @opts.exclude.each {
+ |regexp|
+ return true if regexp =~ url
+ }
+
+ @opts.redundant.each_with_index {
+ |redundant, i|
+
+ if( url =~ redundant['regexp'] )
+
+ if( @opts.redundant[i]['count'] == 0 )
+ print_verbose( 'Discarding redundant page: \'' + url + '\'' )
+ return true
+ end
+
+ print_info( 'Matched redundancy rule: ' +
+ redundant['regexp'].to_s + ' for page \'' +
+ url + '\'' )
+
+ print_info( 'Count-down: ' + @opts.redundant[i]['count'].to_s )
+
+ @opts.redundant[i]['count'] -= 1
+ end
+ }
+
+
+ # skip URLs that match none of the 'include' patterns
+ skip_cnt = 0
+ @opts.include.each {
+ |regexp|
+ skip_cnt += 1 if !(regexp =~ url)
+ }
+
+ return true if skip_cnt == @opts.include.size
+
+ return false
+ end
+
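+ # blocks the caller while the spider has been paused via pause!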
+ def wait_if_paused
+ while( paused? )
+ ::IO::select( nil, nil, nil, 1 )
+ end
+ end
+
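+ # pauses the crawl; it can be resumed with resume!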
+ def pause!
+ @pause = true
+ end
+
+ def resume!
+ @pause = false
+ end
+
+ def paused?
+ @pause ||= false
+ return @pause
+ end
+
#
- # Decodes URLs to reverse multiple encodes and removes NULL characters
+ # Checks if the given URI is in the same domain as the target URL
#
- def url_sanitize( url )
+ # @param [URI, String] uri
+ #
+ # @return [Bool]
+ #
+ def in_domain?( uri )
- while( url =~ /%/ )
- url = ( URI.decode( url ).to_s.unpack( 'A*' )[0] )
+ uri_1 = URI( uri.to_s )
+ uri_2 = URI( @opts.url.to_s )
+
+ if( @opts.follow_subdomains )
+ return extract_domain( uri_1 ) == extract_domain( uri_2 )
end
- return url
+ uri_1.host == uri_2.host
+ end
+
+ #
+ # Extracts the domain from a URI object
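+ # (keeps only the last two host segments, e.g. "crawler.example.com" => "example.com")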
+ #
+ # @param [URI] url
+ #
+ # @return [String, Bool]  the domain, or false if the URL has no host
+ #
+ def extract_domain( url )
+
+ if !url.host then return false end
+
+ splits = url.host.split( /\./ )
+
+ if splits.length == 1 then return true end
+
+ splits[-2] + "." + splits[-1]
end
#
# Hook for further analysis of pages, statistics etc.