lib/parser/parser.rb in arachni-0.2.4 vs lib/parser/parser.rb in arachni-0.3
- old
+ new
@@ -8,13 +8,15 @@
=end
module Arachni
opts = Arachni::Options.instance
+require 'webrick'
require opts.dir['lib'] + 'parser/elements'
require opts.dir['lib'] + 'parser/page'
require opts.dir['lib'] + 'module/utilities'
+require opts.dir['lib'] + 'component_manager'
#
# Analyzer class
#
# Analyzes HTML code, extracting forms, links and cookies
@@ -39,17 +41,46 @@
# Cookies are extracted from the HTTP headers and parsed by WEBrick::Cookie
#
# @author: Tasos "Zapotek" Laskos
# <tasos.laskos@gmail.com>
# <zapotek@segfault.gr>
-# @version: 0.2
+# @version: 0.2.1
#
class Parser
-
+ include Arachni::UI::Output
include Arachni::Module::Utilities
+ module Extractors
#
+    # Base class for Spider path-extractor modules.
+    #
+    # Modules of this type extract paths from a webpage for the Spider to follow.
+    #
+ # @author: Tasos "Zapotek" Laskos
+ # <tasos.laskos@gmail.com>
+ # <zapotek@segfault.gr>
+ # @version: 0.1
+ # @abstract
+ #
+ class Paths
+
+ #
+ # This method must be implemented by all modules and must return an array
+ # of paths as plain strings
+ #
+    # @param  [Nokogiri::HTML::Document]  doc  the document to analyze
+ #
+ # @return [Array<String>] paths
+ #
+ def run( doc )
+
+ end
+ end
+ end
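A concrete extractor just subclasses Extractors::Paths and implements #run. A minimal sketch, assuming the usual Nokogiri API (the Anchors name and the XPath expression are illustrative, not taken from the shipped path_extractors):

    module Arachni
    class Parser
    module Extractors

        # Hypothetical extractor: collects the href of every anchor element.
        class Anchors < Paths

            # @param  [Nokogiri::HTML::Document] doc
            #
            # @return [Array<String>] paths found in the page
            def run( doc )
                doc.search( '//a[@href]' ).map { |a| a['href'] }
            end

        end

    end
    end
    end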
+
+ #
# @return [String] the url of the page
#
attr_accessor :url
#
@@ -66,11 +97,11 @@
# @param [Options] opts
#
def initialize( opts, res )
@opts = opts
- @url = res.effective_url
+ @url = url_sanitize( res.effective_url )
@html = res.body
@response_headers = res.headers_hash
end
#
@@ -87,10 +118,11 @@
:url => @url,
:query_vars => link_vars( @url ),
:html => @html,
:headers => [],
:response_headers => @response_headers,
+ :paths => [],
:forms => [],
:links => [],
:cookies => [],
:cookiejar => []
} )
@@ -106,18 +138,29 @@
preped = {}
cookies_arr.each{ |cookie| preped.merge!( cookie.simple ) }
jar = preped.merge( jar )
+ c_links = links
+
+ if !( vars = link_vars( @url ) ).empty?
+ url = to_absolute( @url )
+ c_links << Arachni::Parser::Element::Link.new( url, {
+ 'href' => url,
+ 'vars' => vars
+ } )
+ end
+
return Page.new( {
:url => @url,
:query_vars => link_vars( @url ),
:html => @html,
:headers => headers(),
:response_headers => @response_headers,
+ :paths => paths(),
:forms => @opts.audit_forms ? forms() : [],
- :links => @opts.audit_links ? links() : [],
+ :links => @opts.audit_links ? c_links : [],
:cookies => merge_with_cookiestore( merge_with_cookiejar( cookies_arr ) ),
:cookiejar => jar
} )
end
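The practical effect of c_links: a page whose own URL carries query parameters is now audited as a link element itself. For example (the URL is made up), with @url = 'http://test.com/index.php?id=1', link_vars returns { 'id' => '1' } and run() appends:

    Arachni::Parser::Element::Link.new( 'http://test.com/index.php?id=1', {
        'href' => 'http://test.com/index.php?id=1',
        'vars' => { 'id' => '1' }
    } )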
@@ -256,11 +299,11 @@
elements[i]['attrs'] = form_attrs( form )
if( !elements[i]['attrs'] || !elements[i]['attrs']['action'] )
action = @url.to_s
else
- action = elements[i]['attrs']['action']
+ action = url_sanitize( elements[i]['attrs']['action'] )
end
action = URI.escape( action ).to_s
elements[i]['attrs']['action'] = to_absolute( action.clone ).to_s
@@ -320,12 +363,21 @@
if !link['href'] then next end
if( exclude?( link['href'] ) ) then next end
if( !include?( link['href'] ) ) then next end
if !in_domain?( URI.parse( link['href'] ) ) then next end
- link['vars'] = link_vars( link['href'] )
+ link['vars'] = {}
+ link_vars( link['href'] ).each_pair {
+ |key, val|
+ begin
+ link['vars'][key] = url_sanitize( val )
+ rescue
+ link['vars'][key] = val
+ end
+ }
+ link['href'] = url_sanitize( link['href'] )
link_arr << Element::Link.new( @url, link )
}
@@ -354,15 +406,16 @@
cookies_arr << Element::Cookie.new( @url, { 'name' => k, 'value' => v } )
}
rescue
end
+
# don't ask me why....
if @response_headers.to_s.substring?( 'set-cookie' )
begin
- cookies << WEBrick::Cookie.parse_set_cookies( @response_headers['Set-Cookie'].to_s )
- cookies << WEBrick::Cookie.parse_set_cookies( @response_headers['set-cookie'].to_s )
+ cookies << ::WEBrick::Cookie.parse_set_cookies( @response_headers['Set-Cookie'].to_s )
+ cookies << ::WEBrick::Cookie.parse_set_cookies( @response_headers['set-cookie'].to_s )
rescue
return cookies_arr
end
end
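The new :: prefix anchors the constant lookup at the top level, so the WEBrick required at the head of this file is found regardless of what the surrounding Arachni namespace defines. The call itself, shown on a sample header value:

    require 'webrick'

    header  = 'SESSIONID=deadbeef; path=/; HttpOnly'
    cookies = ::WEBrick::Cookie.parse_set_cookies( header )

    cookies.first.name   #=> "SESSIONID"
    cookies.first.value  #=> "deadbeef"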
@@ -388,11 +441,37 @@
}
cookies_arr.flatten!
return cookies_arr
end
+    #
+    # Returns the directory of the URL's path component as a URI (e.g. '/a/b/c.php' => '/a/b/')
+    #
+    def dir( url )
+ URI( File.dirname( URI( url.to_s ).path ) + '/' )
+ end
+
#
+    # Array of distinct paths for the Spider to follow
+    #
+    # @return [Array<String>]
+ #
+ def paths
+ return @paths unless @paths.nil?
+ @paths = []
+ return @paths if !doc
+
+ run_extractors( ).each {
+ |path|
+ next if path.nil? or path.empty?
+ abs = to_absolute( path ) rescue next
+
+ @paths << abs if in_domain?( abs )
+ }
+
+ @paths.uniq!
+ return @paths
+ end
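Since the result is memoized in @paths, the extractors run at most once per parsed page; a usage sketch (opts and res as in #initialize above):

    parser = Arachni::Parser.new( opts, res )

    parser.paths   # runs the path_extractors and caches the result
    parser.paths   # served from @paths, the extractors don't run again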
+
+ #
# Extracts variables and their values from a link
#
# @see #links
#
# @param [String] link
@@ -432,30 +511,40 @@
if URI.parse( link ).host
return link
end
rescue Exception => e
return nil if link.nil?
- # return link
end
# remove anchor
- link = URI.encode( link.to_s.gsub( /#[a-zA-Z0-9_-]*$/, '' ) )
+ link = URI.encode( link.to_s.gsub( /#[a-zA-Z0-9_-]*$/,'' ) )
-    begin
-        relative = URI(link)
-        url = URI.parse( @url )
-        absolute = url.merge(relative)
-        absolute.path = '/' if absolute.path.empty?
-    rescue Exception => e
-        return
-    end
+    if url = base
+        base_url = URI( url )
+    else
+        base_url = URI( @url )
+    end
+    relative = URI( link )
+    absolute = base_url.merge( relative )
+    absolute.path = '/' if absolute.path && absolute.path.empty?
return absolute.to_s
end
+
+ def base
+ begin
+ tmp = doc.search( '//base[@href]' )
+ return tmp[0]['href'].dup
+ rescue
+ return
+ end
+ end
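With a <base href> present, relative links now resolve against it rather than against the page URL; URI#merge does the actual resolution. The URLs below are made up:

    require 'uri'

    relative = URI( 'a/relative.php' )

    # page declares <base href="http://test.com/stuff/">
    URI( 'http://test.com/stuff/' ).merge( relative ).to_s
    #=> "http://test.com/stuff/a/relative.php"

    # no base tag: fall back to the page URL, as before
    URI( 'http://test.com/dir/index.html' ).merge( relative ).to_s
    #=> "http://test.com/dir/a/relative.php"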
+
#
# Returns +true+ if *uri* is in the same domain as the page, returns
# +false+ otherwise
#
def in_domain?( uri )
@@ -505,9 +594,32 @@
return false
end
private
+
+ #
+ # Runs all Spider (path extraction) modules and returns an array of paths
+ #
+    # @return [Array<String>] paths
+ #
+ def run_extractors
+ lib = @opts.dir['root'] + 'path_extractors/'
+
+ begin
+ @@manager ||= ::Arachni::ComponentManager.new( lib, Extractors )
+
+ return @@manager.available.map {
+ |name|
+ @@manager[name].new.run( doc )
+ }.flatten.uniq
+
+ rescue ::Exception => e
+ print_error( e.to_s )
+ print_debug_backtrace( e )
+ end
+ end
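run_extractors leans on the new ComponentManager to locate and load everything under path_extractors/; since @@manager is a class variable, that discovery happens once per process. In outline (the 'anchors' name is an example):

    manager = ::Arachni::ComponentManager.new( lib, Extractors )

    manager.available                    # names derived from the *.rb files, e.g. [ 'anchors' ]
    manager['anchors']                   # loads (if necessary) and returns the component class
    manager['anchors'].new.run( doc )    # each extractor returns an array of path strings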
#
# Merges an array of form inputs with an array of form selects
#
# @see #forms