lib/spider.rb in arachni-0.2.4 vs lib/spider.rb in arachni-0.3
- old
+ new
@@ -6,12 +6,13 @@
this program under the term of the GPL v2.0 License
(See LICENSE file for details)
=end
-require Arachni::Options.instance.dir['lib'] + 'anemone'
require Arachni::Options.instance.dir['lib'] + 'module/utilities'
+require 'nokogiri'
+require Arachni::Options.instance.dir['lib'] + 'nokogiri/xml/node'
module Arachni
#
# Spider class
@@ -19,11 +20,11 @@
# Crawls the URL in opts[:url] and grabs the HTML code and headers.
#
# @author: Tasos "Zapotek" Laskos
# <tasos.laskos@gmail.com>
# <zapotek@segfault.gr>
-# @version: 0.1
+# @version: 0.2
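+#
+# Rough usage sketch (assumes an already configured Options instance):
+#
+#    spider = Arachni::Spider.new( Arachni::Options.instance )
+#    sitemap = spider.run { |page| puts page.url }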
#
class Spider
include Arachni::UI::Output
include Arachni::Module::Utilities
@@ -32,12 +33,10 @@
#
# @return [Options]
#
attr_reader :opts
- attr_reader :pages
-
#
# Sitemap, array of links
#
# @return [Array]
#
@@ -57,35 +56,10 @@
# @param [Options] opts
#
def initialize( opts )
@opts = opts
- @anemone_opts = {
- :threads => 1,
- :discard_page_bodies => false,
- :delay => 0,
- :obey_robots_txt => false,
- :depth_limit => false,
- :link_count_limit => false,
- :redirect_limit => false,
- :storage => nil,
- :cookies => nil,
- :accept_cookies => true,
- :proxy_addr => nil,
- :proxy_port => nil,
- :proxy_user => nil,
- :proxy_pass => nil
- }
-
- hash_opts = @opts.to_h
- @anemone_opts.each_pair {
- |k, v|
- @anemone_opts[k] = hash_opts[k.to_s] if hash_opts[k.to_s]
- }
-
- @anemone_opts = @anemone_opts.merge( hash_opts )
-
@sitemap = []
@on_every_page_blocks = []
# if we have no 'include' patterns create one that will match
# everything, like '.*'
@@ -100,88 +74,173 @@
# @return [Array]  the sitemap, a list of crawled URLs
#
def run( &block )
return if @opts.link_count_limit == 0
- i = 1
- # start the crawl
- Anemone.crawl( @opts.url, @anemone_opts ) {
- |anemone|
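+ # Anemone has been replaced by a simple work-queue crawl:
+ # URLs are popped off 'paths', fetched through Arachni::HTTP
+ # and parsed for new paths until the queue is exhausted.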
+ paths = []
+ paths << @opts.url.to_s
- # apply 'exclude' patterns
- anemone.skip_links_like( @opts.exclude ) if @opts.exclude
+ visited = []
- # apply 'include' patterns and grab matching pages
- # as they are discovered
- anemone.on_pages_like( @opts.include ) {
- |page|
+ while( !paths.empty? )
+ while( !paths.empty? && url = paths.pop )
+ url = url_sanitize( url )
+ next if skip?( url ) || !in_domain?( url )
- @pages = anemone.pages.keys || []
+ wait_if_paused
- url = url_sanitize( page.url.to_s )
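+ # mark the URL as visited before the request is (possibly
+ # asynchronously) performed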
+ visited << url
- # something went kaboom, tell the user and skip the page
- if page.error
- print_error( "[Error: " + (page.error.to_s) + "] " + url )
- print_debug_backtrace( page.error )
- next
- end
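+ # request options; with spider_first the request is queued
+ # asynchronously and performed later in a single batch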
+ opts = {
+ :timeout => nil,
+ :remove_id => true,
+ :async => @opts.spider_first
+ }
- # push the url in the sitemap
- @sitemap.push( url )
+ Arachni::HTTP.instance.get( url, opts ).on_complete {
+ |res|
- print_line
- print_status( "[HTTP: #{page.code}] " + url )
+ print_line
+ print_status( "[HTTP: #{res.code}] " + res.effective_url )
- # call the block...if we have one
- if block
- exception_jail{
- new_page = Arachni::Parser.new( @opts,
- Typhoeus::Response.new(
- :effective_url => url,
- :body => page.body,
- :headers_hash => page.headers
- )
- ).run
- new_page.code = page.code
- new_page.method = 'GET'
- block.call( new_page.clone )
+ page = Arachni::Parser.new( @opts, res ).run
+ page.url = url_sanitize( res.effective_url )
+
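+ # merge the newly discovered paths into the sitemap and
+ # queue any that haven't been visited yet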
+ @sitemap |= page.paths.map { |path| url_sanitize( path ) }
+ paths |= @sitemap - visited
+
+
+ # call the block...if we have one
+ if block
+ exception_jail{
+ block.call( page.clone )
+ }
+ end
+
+ # run blocks specified later
+ @on_every_page_blocks.each {
+ |block|
+ block.call( page )
}
- end
- # run blocks specified later
- @on_every_page_blocks.each {
- |block|
- block.call( page )
}
- # we don't need the HTML doc anymore
- page.discard_doc!( )
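+ # without spider_first perform the queued request (and run its
+ # on_complete handler) before moving on to the next URL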
+ Arachni::HTTP.instance.run if !@opts.spider_first
# make sure we obey the link count limit and
# return if we have exceeded it.
if( @opts.link_count_limit &&
- @opts.link_count_limit <= i )
+ @opts.link_count_limit <= visited.size )
+ Arachni::HTTP.instance.run if @opts.spider_first
return @sitemap.uniq
end
- i+=1
- }
- }
+ end
+
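+ # with spider_first the queued requests are performed here in one
+ # async batch; their callbacks feed paths into the next pass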
+ if @opts.spider_first
+ Arachni::HTTP.instance.run
+ else
+ break
+ end
+
+ end
+
return @sitemap.uniq
end
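+
+ #
+ # Decides whether a URL should be skipped, based on the
+ # exclude, redundant and include rules of the options.
+ #
+ # @param [String] url
+ #
+ # @return [Bool]
+ #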
+ def skip?( url )
+ @opts.exclude.each {
+ |regexp|
+ return true if regexp =~ url
+ }
+
+ @opts.redundant.each_with_index {
+ |redundant, i|
+
+ if( url =~ redundant['regexp'] )
+
+ if( @opts.redundant[i]['count'] == 0 )
+ print_verbose( 'Discarding redundant page: \'' + url + '\'' )
+ return true
+ end
+
+ print_info( 'Matched redundancy rule: ' +
+ redundant['regexp'].to_s + ' for page \'' +
+ url + '\'' )
+
+ print_info( 'Count-down: ' + @opts.redundant[i]['count'].to_s )
+
+ @opts.redundant[i]['count'] -= 1
+ end
+ }
+
+
+ # skip URLs that match none of the 'include' patterns
+ skip_cnt = 0
+ @opts.include.each {
+ |regexp|
+ skip_cnt += 1 if !(regexp =~ url)
+ }
+
+ return true if skip_cnt == @opts.include.size
+
+ return false
+ end
+
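+ # blocks the caller while the spider has been paused via pause!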
+ def wait_if_paused
+ while( paused? )
+ ::IO::select( nil, nil, nil, 1 )
+ end
+ end
+
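+ # pauses the crawl; it can be resumed with resume!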
+ def pause!
+ @pause = true
+ end
+
+ def resume!
+ @pause = false
+ end
+
+ def paused?
+ @pause ||= false
+ return @pause
+ end
+
#
- # Decodes URLs to reverse multiple encodes and removes NULL characters
+ # Checks if the given URI is in the same domain as the target URL
#
- def url_sanitize( url )
+ # @param [URI, String] uri
+ #
+ # @return [Bool]
+ #
+ def in_domain?( uri )
- while( url =~ /%/ )
- url = ( URI.decode( url ).to_s.unpack( 'A*' )[0] )
+ uri_1 = URI( uri.to_s )
+ uri_2 = URI( @opts.url.to_s )
+
+ if( @opts.follow_subdomains )
+ return extract_domain( uri_1 ) == extract_domain( uri_2 )
end
- return url
+ uri_1.host == uri_2.host
+ end
+
+ #
+ # Extracts the domain from a URI object
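+ # (keeps only the last two host segments, e.g. "crawler.example.com" => "example.com")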
+ #
+ # @param [URI] url
+ #
+ # @return [String, Bool]  the domain, or false if the URL has no host
+ #
+ def extract_domain( url )
+
+ if !url.host then return false end
+
+ splits = url.host.split( /\./ )
+
+ if splits.length == 1 then return true end
+
+ splits[-2] + "." + splits[-1]
end
#
# Hook for further analysis of pages, statistics etc.