lib/parser/parser.rb in arachni-0.2.4 vs lib/parser/parser.rb in arachni-0.3

- old
+ new

@@ -8,13 +8,15 @@
 =end

 module Arachni

 opts = Arachni::Options.instance
+require 'webrick'
 require opts.dir['lib'] + 'parser/elements'
 require opts.dir['lib'] + 'parser/page'
 require opts.dir['lib'] + 'module/utilities'
+require opts.dir['lib'] + 'component_manager'

 #
 # Analyzer class
 #
 # Analyzes HTML code extracting forms, links and cookies
@@ -39,17 +41,46 @@
 # Cookies are extracted from the HTTP headers and parsed by WEBrick::Cookie
 #
 # @author: Tasos "Zapotek" Laskos
 #          <tasos.laskos@gmail.com>
 #          <zapotek@segfault.gr>
-# @version: 0.2
+# @version: 0.2.1
 #
 class Parser
-
+    include Arachni::UI::Output
     include Arachni::Module::Utilities

+    module Extractors
+        #
+        # Base Spider parser class for modules.
+        #
+        # The aim of such modules is to extract paths from a webpage for the Spider to follow.
+        #
+        #
+        # @author: Tasos "Zapotek" Laskos
+        #          <tasos.laskos@gmail.com>
+        #          <zapotek@segfault.gr>
+        # @version: 0.1
+        # @abstract
+        #
+        class Paths
+
+            #
+            # This method must be implemented by all modules and must return an array
+            # of paths as plain strings
+            #
+            # @param  [Nokogiri]  Nokogiri document
+            #
+            # @return [Array<String>]  paths
+            #
+            def run( doc )
+
+            end
+        end
+    end
+
     #
     # @return [String] the url of the page
     #
     attr_accessor :url

     #
@@ -66,11 +97,11 @@
     # @param  [Options] opts
     #
     def initialize( opts, res )
         @opts = opts
-        @url  = res.effective_url
+        @url  = url_sanitize( res.effective_url )
         @html = res.body
         @response_headers = res.headers_hash
     end

     #
@@ -87,10 +118,11 @@
             :url => @url,
             :query_vars => link_vars( @url ),
             :html => @html,
             :headers => [],
             :response_headers => @response_headers,
+            :paths => [],
             :forms => [],
             :links => [],
             :cookies => [],
             :cookiejar => []
         } )
@@ -106,18 +138,29 @@
         preped = {}
         cookies_arr.each{ |cookie| preped.merge!( cookie.simple ) }

         jar = preped.merge( jar )

+        c_links = links
+
+        if !( vars = link_vars( @url ) ).empty?
+            url = to_absolute( @url )
+            c_links << Arachni::Parser::Element::Link.new( url, {
+                'href' => url,
+                'vars' => vars
+            } )
+        end
+
         return Page.new( {
             :url => @url,
             :query_vars => link_vars( @url ),
             :html => @html,
             :headers => headers(),
             :response_headers => @response_headers,
+            :paths => paths(),
             :forms => @opts.audit_forms ? forms() : [],
-            :links => @opts.audit_links ? links() : [],
+            :links => @opts.audit_links ? c_links : [],
             :cookies => merge_with_cookiestore( merge_with_cookiejar( cookies_arr ) ),
             :cookiejar => jar
         } )
     end
@@ -256,11 +299,11 @@
             elements[i]['attrs'] = form_attrs( form )

             if( !elements[i]['attrs'] || !elements[i]['attrs']['action'] )
                 action = @url.to_s
             else
-                action = elements[i]['attrs']['action']
+                action = url_sanitize( elements[i]['attrs']['action'] )
             end

             action = URI.escape( action ).to_s

             elements[i]['attrs']['action'] = to_absolute( action.clone ).to_s
@@ -320,12 +363,21 @@
             if !link['href'] then next end
             if( exclude?( link['href'] ) ) then next end
             if( !include?( link['href'] ) ) then next end
             if !in_domain?( URI.parse( link['href'] ) ) then next end

-            link['vars'] = link_vars( link['href'] )
+            link['vars'] = {}
+            link_vars( link['href'] ).each_pair {
+                |key, val|
+                begin
+                    link['vars'][key] = url_sanitize( val )
+                rescue
+                    link['vars'][key] = val
+                end
+            }
+
+            link['href'] = url_sanitize( link['href'] )

             link_arr << Element::Link.new( @url, link )
         }
@@ -354,15 +406,16 @@
                 cookies_arr << Element::Cookie.new( @url,
                     { 'name' => k, 'value' => v } )
             }
         rescue
         end

+        # don't ask me why....
         if @response_headers.to_s.substring?( 'set-cookie' )
             begin
-                cookies << WEBrick::Cookie.parse_set_cookies( @response_headers['Set-Cookie'].to_s )
-                cookies << WEBrick::Cookie.parse_set_cookies( @response_headers['set-cookie'].to_s )
+                cookies << ::WEBrick::Cookie.parse_set_cookies( @response_headers['Set-Cookie'].to_s )
+                cookies << ::WEBrick::Cookie.parse_set_cookies( @response_headers['set-cookie'].to_s )
             rescue
                 return cookies_arr
             end
         end
@@ -388,11 +441,37 @@
         }

         cookies_arr.flatten!
         return cookies_arr
     end

+    def dir( url )
+        URI( File.dirname( URI( url.to_s ).path ) + '/' )
+    end
+
+    #
+    # Array of distinct links to follow
+    #
+    # @return [Array<URI>]
+    #
+    def paths
+        return @paths unless @paths.nil?
+        @paths = []
+        return @paths if !doc
+
+        run_extractors( ).each {
+            |path|
+            next if path.nil? or path.empty?
+            abs = to_absolute( path ) rescue next
+
+            @paths << abs if in_domain?( abs )
+        }
+
+        @paths.uniq!
+        return @paths
+    end
+
     #
     # Extracts variables and their values from a link
     #
     # @see #links
     #
     # @param [String] link
@@ -432,30 +511,40 @@
             if URI.parse( link ).host
                 return link
             end
         rescue Exception => e
             return nil if link.nil?
-            #
             return link
         end

         # remove anchor
-        link = URI.encode( link.to_s.gsub( /#[a-zA-Z0-9_-]*$/, '' ) )
+        link = URI.encode( link.to_s.gsub( /#[a-zA-Z0-9_-]*$/,'' ) )

-        begin
-            relative = URI(link)
-            url = URI.parse( @url )
+        if url = base
+            base_url = URI( url )
+        else
+            base_url = URI( @url )
+        end

-            absolute = url.merge(relative)
+        relative = URI( link )
+        absolute = base_url.merge( relative )

-            absolute.path = '/' if absolute.path.empty?
-        rescue Exception => e
-            return
-        end
+        absolute.path = '/' if absolute.path && absolute.path.empty?

         return absolute.to_s
     end

+    def base
+        begin
+            tmp = doc.search( '//base[@href]' )
+            return tmp[0]['href'].dup
+        rescue
+            return
+        end
+    end
+
     #
     # Returns +true+ if *uri* is in the same domain as the page, returns
     # +false+ otherwise
     #
     def in_domain?( uri )
@@ -505,9 +594,32 @@
         return false
     end

     private

+    #
+    # Runs all Spider (path extraction) modules and returns an array of paths
+    #
+    # @return [Array]  paths
+    #
+    def run_extractors
+        lib = @opts.dir['root'] + 'path_extractors/'
+
+        begin
+            @@manager ||= ::Arachni::ComponentManager.new( lib, Extractors )
+
+            return @@manager.available.map {
+                |name|
+                @@manager[name].new.run( doc )
+            }.flatten.uniq
+
+        rescue ::Exception => e
+            print_error( e.to_s )
+            print_debug_backtrace( e )
+        end
+    end
+
     #
     # Merges an array of form inputs with an array of form selects
     #
     # @see #forms
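
A note on the new Extractors API introduced in 0.3: the abstract Parser::Extractors::Paths class defines the contract for Spider path-extraction modules, and run_extractors loads them from the path_extractors/ directory through Arachni::ComponentManager. As a rough illustration only (the Anchors name and the XPath are made up for this example, not taken from the 0.3 tree), a conforming module could look like:

    #
    # Hypothetical path extractor: collects the 'href' attribute of every
    # anchor element so the Spider can follow those paths.
    #
    class Anchors < Arachni::Parser::Extractors::Paths

        #
        # @param  [Nokogiri::HTML::Document]  doc
        #
        # @return [Array<String>]  paths as plain strings
        #
        def run( doc )
            doc.search( '//a[@href]' ).map { |a| a['href'] }
        end

    end

Parser#paths then converts each returned string with to_absolute, keeps only in-domain results, and de-duplicates them.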
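
The reworked to_absolute is also worth a note: when the new base helper finds a <base href="..."> tag in the document, relative links are resolved against that URL instead of the page URL, via stdlib URI#merge. A small illustration of the underlying merge behavior (the URLs are made up):

    require 'uri'

    # With <base href="http://example.com/app/"> present on the page,
    # a relative link resolves against the base, not the page URL.
    base_url = URI( 'http://example.com/app/' )
    relative = URI( 'admin/login.php' )

    base_url.merge( relative ).to_s
    #=> "http://example.com/app/admin/login.php"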