lib/parser/parser.rb in arachni-0.2.4 vs lib/parser/parser.rb in arachni-0.3

- old
+ new

@@ -8,13 +8,15 @@
 =end

 module Arachni

 opts = Arachni::Options.instance
+require 'webrick'
 require opts.dir['lib'] + 'parser/elements'
 require opts.dir['lib'] + 'parser/page'
 require opts.dir['lib'] + 'module/utilities'
+require opts.dir['lib'] + 'component_manager'

 #
 # Analyzer class
 #
 # Analyzes HTML code extracting forms, links and cookies
@@ -39,17 +41,46 @@
 # Cookies are extracted from the HTTP headers and parsed by WEBrick::Cookie
 #
 # @author: Tasos "Zapotek" Laskos
 #          <tasos.laskos@gmail.com>
 #          <zapotek@segfault.gr>
-# @version: 0.2
+# @version: 0.2.1
 #
 class Parser
-
+    include Arachni::UI::Output
     include Arachni::Module::Utilities

+    module Extractors
+        #
+        # Base Spider parser class for modules.
+        #
+        # The aim of such modules is to extract paths from a webpage for the Spider to follow.
+        #
+        #
+        # @author: Tasos "Zapotek" Laskos
+        #          <tasos.laskos@gmail.com>
+        #          <zapotek@segfault.gr>
+        # @version: 0.1
+        # @abstract
+        #
+        class Paths
+
+            #
+            # This method must be implemented by all modules and must return an array
+            # of paths as plain strings
+            #
+            # @param  [Nokogiri]  Nokogiri document
+            #
+            # @return [Array<String>]  paths
+            #
+            def run( doc )
+
+            end
+        end
+    end
+
     #
     # @return [String] the url of the page
     #
     attr_accessor :url

     #
@@ -66,11 +97,11 @@
     # @param  [Options] opts
     #
     def initialize( opts, res )
         @opts = opts
-        @url  = res.effective_url
+        @url  = url_sanitize( res.effective_url )
         @html = res.body
         @response_headers = res.headers_hash
     end

     #
@@ -87,10 +118,11 @@
             :url => @url,
             :query_vars => link_vars( @url ),
             :html => @html,
             :headers => [],
             :response_headers => @response_headers,
+            :paths => [],
             :forms => [],
             :links => [],
             :cookies => [],
             :cookiejar => []
         } )
@@ -106,18 +138,29 @@
         preped = {}
         cookies_arr.each{ |cookie| preped.merge!( cookie.simple ) }

         jar = preped.merge( jar )

+        c_links = links
+
+        if !( vars = link_vars( @url ) ).empty?
+            url = to_absolute( @url )
+            c_links << Arachni::Parser::Element::Link.new( url, {
+                'href' => url,
+                'vars' => vars
+            } )
+        end
+
         return Page.new( {
             :url => @url,
             :query_vars => link_vars( @url ),
             :html => @html,
             :headers => headers(),
             :response_headers => @response_headers,
+            :paths => paths(),
             :forms => @opts.audit_forms ? forms() : [],
-            :links => @opts.audit_links ? links() : [],
+            :links => @opts.audit_links ? c_links : [],
             :cookies => merge_with_cookiestore( merge_with_cookiejar( cookies_arr ) ),
             :cookiejar => jar
         } )
     end
@@ -256,11 +299,11 @@
             elements[i]['attrs'] = form_attrs( form )

             if( !elements[i]['attrs'] || !elements[i]['attrs']['action'] )
                 action = @url.to_s
             else
-                action = elements[i]['attrs']['action']
+                action = url_sanitize( elements[i]['attrs']['action'] )
             end

             action = URI.escape( action ).to_s

             elements[i]['attrs']['action'] = to_absolute( action.clone ).to_s
@@ -320,12 +363,21 @@
             if !link['href'] then next end
             if( exclude?( link['href'] ) ) then next end
             if( !include?( link['href'] ) ) then next end
             if !in_domain?( URI.parse( link['href'] ) ) then next end

-            link['vars'] = link_vars( link['href'] )
+            link['vars'] = {}
+            link_vars( link['href'] ).each_pair {
+                |key, val|
+                begin
+                    link['vars'][key] = url_sanitize( val )
+                rescue
+                    link['vars'][key] = val
+                end
+            }
+
+            link['href'] = url_sanitize( link['href'] )

             link_arr << Element::Link.new( @url, link )
         }
@@ -354,15 +406,16 @@
                 cookies_arr << Element::Cookie.new( @url,
                     { 'name' => k, 'value' => v } )
             }
         rescue
         end

+        # don't ask me why....
         if @response_headers.to_s.substring?( 'set-cookie' )
             begin
-                cookies << WEBrick::Cookie.parse_set_cookies( @response_headers['Set-Cookie'].to_s )
-                cookies << WEBrick::Cookie.parse_set_cookies( @response_headers['set-cookie'].to_s )
+                cookies << ::WEBrick::Cookie.parse_set_cookies( @response_headers['Set-Cookie'].to_s )
+                cookies << ::WEBrick::Cookie.parse_set_cookies( @response_headers['set-cookie'].to_s )
             rescue
                 return cookies_arr
             end
         end
@@ -388,11 +441,37 @@
         }

         cookies_arr.flatten!
         return cookies_arr
     end

+    def dir( url )
+        URI( File.dirname( URI( url.to_s ).path ) + '/' )
+    end
+
+    #
+    # Array of distinct links to follow
+    #
+    # @return [Array<URI>]
+    #
+    def paths
+        return @paths unless @paths.nil?
+        @paths = []
+        return @paths if !doc
+
+        run_extractors( ).each {
+            |path|
+            next if path.nil? or path.empty?
+            abs = to_absolute( path ) rescue next
+
+            @paths << abs if in_domain?( abs )
+        }
+
+        @paths.uniq!
+        return @paths
+    end
+
     #
     # Extracts variables and their values from a link
     #
     # @see #links
     #
     # @param [String] link
@@ -432,30 +511,40 @@
             if URI.parse( link ).host
                 return link
             end
         rescue Exception => e
             return nil if link.nil?
-            #
             return link
         end

         # remove anchor
-        link = URI.encode( link.to_s.gsub( /#[a-zA-Z0-9_-]*$/, '' ) )
+        link = URI.encode( link.to_s.gsub( /#[a-zA-Z0-9_-]*$/,'' ) )

-        begin
-            relative = URI(link)
-            url = URI.parse( @url )
+        if url = base
+            base_url = URI( url )
+        else
+            base_url = URI( @url )
+        end

-            absolute = url.merge(relative)
+        relative = URI( link )
+        absolute = base_url.merge( relative )

-            absolute.path = '/' if absolute.path.empty?
-        rescue Exception => e
-            return
-        end
+        absolute.path = '/' if absolute.path && absolute.path.empty?

         return absolute.to_s
     end

+    def base
+        begin
+            tmp = doc.search( '//base[@href]' )
+            return tmp[0]['href'].dup
+        rescue
+            return
+        end
+    end
+
     #
     # Returns +true+ if *uri* is in the same domain as the page, returns
     # +false+ otherwise
     #
     def in_domain?( uri )
@@ -505,9 +594,32 @@
         return false
     end

     private

+    #
+    # Runs all Spider (path extraction) modules and returns an array of paths
+    #
+    # @return [Array]  paths
+    #
+    def run_extractors
+        lib = @opts.dir['root'] + 'path_extractors/'
+
+        begin
+            @@manager ||= ::Arachni::ComponentManager.new( lib, Extractors )
+
+            return @@manager.available.map {
+                |name|
+                @@manager[name].new.run( doc )
+            }.flatten.uniq
+
+        rescue ::Exception => e
+            print_error( e.to_s )
+            print_debug_backtrace( e )
+        end
+    end
+
     #
     # Merges an array of form inputs with an array of form selects
     #
     # @see #forms
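
A note on the new Extractors API introduced in 0.3: the abstract Parser::Extractors::Paths class defines the contract for Spider path-extraction modules, and run_extractors loads them from the path_extractors/ directory through Arachni::ComponentManager. As a rough illustration only (the Anchors name and the XPath are made up for this example, not taken from the 0.3 tree), a conforming module could look like:

    #
    # Hypothetical path extractor: collects the 'href' attribute of every
    # anchor element so the Spider can follow those paths.
    #
    class Anchors < Arachni::Parser::Extractors::Paths

        #
        # @param  [Nokogiri::HTML::Document]  doc
        #
        # @return [Array<String>]  paths as plain strings
        #
        def run( doc )
            doc.search( '//a[@href]' ).map { |a| a['href'] }
        end

    end

Parser#paths then converts each returned string with to_absolute, keeps only in-domain results, and de-duplicates them.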
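
The reworked to_absolute is also worth a note: when the new base helper finds a <base href="..."> tag in the document, relative links are resolved against that URL instead of the page URL, via stdlib URI#merge. A small illustration of the underlying merge behavior (the URLs are made up):

    require 'uri'

    # With <base href="http://example.com/app/"> present on the page,
    # a relative link resolves against the base, not the page URL.
    base_url = URI( 'http://example.com/app/' )
    relative = URI( 'admin/login.php' )

    base_url.merge( relative ).to_s
    #=> "http://example.com/app/admin/login.php"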