=begin
    Copyright 2010-2013 Tasos Laskos

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
=end

module Arachni

lib = Options.dir['lib']

# Load all available element types.
Dir.glob( lib + 'element/*.rb' ).each { |f| require f }

require lib + 'page'
require lib + 'utilities'
require lib + 'component/manager'

#
# HTML Parser
#
# Analyzes HTML code, extracting forms, links and cookies depending on user opts.
#
# @author Tasos "Zapotek" Laskos
#
class Parser
    include UI::Output
    include Utilities

    module Extractors
        # @author Tasos "Zapotek" Laskos
        # @abstract
        class Base
            #
            # This method must be implemented by all modules and must return
            # an array of paths as plain strings.
            #
            # @param [Nokogiri::HTML::Document] doc Nokogiri document.
            #
            # @return [Array<String>] Paths.
            #
            def run( doc )
            end
        end
    end

    alias :skip? :skip_path?

    # @return [String] The URL of the page.
    attr_reader :url

    # @return [Options] Options instance.
    attr_reader :opts

    #
    # @param [Typhoeus::Response, Array<Typhoeus::Response>] res
    #   Response(s) to analyze and parse into a {Page}. By providing multiple
    #   responses the parser will be able to perform some preliminary
    #   differential analysis and identify nonce tokens in inputs.
    #
    # @param [Options] opts
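    #
    # @example Minimal usage sketch
    #    # `response` and `refreshed_response` are assumed to be Typhoeus
    #    # responses to the same request.
    #
    #    # Parse a single response into a Page:
    #    page = Parser.new( response ).page
    #
    #    # A second response lets the parser flag hidden inputs whose values
    #    # differ between the two as nonce tokens:
    #    page = Parser.new( [response, refreshed_response] ).page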
    #
    def initialize( res, opts = Options )
        @opts = opts

        if res.is_a? Array
            @secondary_responses = res[1..-1]
            @secondary_responses.compact! if @secondary_responses
            res = res.shift
        end

        @code = res.code
        self.url = res.effective_url
        @html = res.body
        @response = res

        @response_headers = res.headers_hash

        @doc   = nil
        @paths = nil
    end

    def url=( str )
        @url = normalize_url( uri_decode( str ) )
        @url = normalize_url( str ) if !@url
        @url
    end

    #
    # Converts a relative URL to an absolute one.
    #
    # @param [String] relative_url URL to convert to absolute.
    #
    # @return [String] Absolute URL.
    #
    def to_absolute( relative_url )
        if url = base
            base_url = url
        else
            base_url = @url
        end
        super( relative_url, base_url )
    end

    # @return [Page]
    #   Parsed page object based on the given options and HTTP responses.
    def page
        req_method = @response.request ? @response.request.method.to_s : 'get'

        self_link = Link.new( @url, inputs: link_vars( @url ) )

        # Non-text responses won't contain any auditable elements.
        if !text?
            page = Page.new(
                code:             @code,
                url:              @url,
                method:           req_method,
                query_vars:       self_link.auditable,
                body:             @html,
                request_headers:  @response.request ? @response.request.headers : {},
                response_headers: @response_headers,
                text:             false
            )
            Platform::Manager.fingerprint( page ) if Options.fingerprint?
            return page
        end

        # Extract cookies from the response.
        c_cookies = cookies

        # Make a list of the response cookie names.
        cookie_names = c_cookies.map { |c| c.name }

        from_jar = []

        # If there's a Netscape cookiejar file, load cookies from it -- but
        # only new ones, i.e. only if they weren't already in the response.
        if @opts.cookie_jar.is_a?( String ) && File.exist?( @opts.cookie_jar )
            from_jar |= cookies_from_file( @url, @opts.cookie_jar ).
                reject { |c| cookie_names.include?( c.name ) }
        end

        # If we somehow have runtime configuration cookies, load them too --
        # but only if they haven't already been seen.
        if @opts.cookies && !@opts.cookies.empty?
            from_jar |= @opts.cookies.reject { |c| cookie_names.include?( c.name ) }
        end

        # Grab cookies from the HTTP cookiejar and filter out old ones, as usual.
        from_http_jar = HTTP.instance.cookie_jar.cookies.reject do |c|
            cookie_names.include?( c.name )
        end

        # These cookies are to be audited and thus are dirty and anarchistic,
        # so they have to contain even cookies completely irrelevant to the
        # current page -- i.e. all cookies that have been observed since the
        # beginning of the scan.
        cookies_to_be_audited = (c_cookies | from_jar | from_http_jar).map do |c|
            dc = c.dup
            dc.action = @url
            dc
        end

        page = Page.new(
            code:             @code,
            url:              @url,
            query_vars:       self_link.auditable,
            method:           req_method,
            body:             @html,

            request_headers:  @response.request ? @response.request.headers : {},
            response_headers: @response_headers,

            document:         doc,

            # All paths seen in the page.
            paths:            paths,
            forms:            forms,

            # All `href` attributes from `a` elements.
            links:            links | [self_link],

            cookies:          cookies_to_be_audited,
            headers:          headers,

            # This is the page cookiejar; each time the page is to be audited
            # by a module, the cookiejar of the HTTP class will be updated
            # with the cookies specified here.
            cookiejar:        c_cookies | from_jar,

            # Contains text-based data -- i.e. not a binary response.
            text:             true
        )
        Platform::Manager.fingerprint( page ) if Options.fingerprint?
        page
    end
    alias :run :page

    # @return [Boolean]
    #   `true` if the given HTTP response data are text based, `false` otherwise.
    def text?
        @response.text?
    end

    # @return [Nokogiri::HTML::Document, nil]
    #   Parsed HTML document from the body of the HTTP response, or `nil` if
    #   the response data wasn't {#text? text-based} or the response couldn't
    #   be parsed.
    def doc
        return @doc if @doc
        @doc = Nokogiri::HTML( @html ) if text? rescue nil
    end

    #
    # @note This is more of a placeholder method; it doesn't actually analyze
    #   anything. It's a long shot that any of these will be vulnerable but
    #   better be safe than sorry.
    #
    # @return [Array<Header>] List of valid auditable HTTP header fields.
    #
    def headers
        {
            'Accept'          => 'text/html,application/xhtml+xml,application' +
                '/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset'  => 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
            'From'            => @opts.authed_by  || '',
            'User-Agent'      => @opts.user_agent || '',
            'Referer'         => @url,
            'Pragma'          => 'no-cache'
        }.map { |k, v| Header.new( @url, { k => v } ) }
    end

    # @param [String, Nokogiri::HTML::Document] html
    #   Document to analyze, defaults to {#doc}.
    #
    # @return [Array<Form>] Forms from `html`.
    def forms( html = nil )
        return [] if !text? && !html

        f = Form.from_document( @url, html || doc )
        return f if !@secondary_responses

        @secondary_responses.each do |response|
            next if response.body.to_s.empty?

            Form.from_document( @url, response.body ).each do |form2|
                f.each do |form|
                    next if "#{form.id}:#{form.name_or_id}" !=
                        "#{form2.id}:#{form2.name_or_id}"

                    form.auditable.each do |k, v|
                        # A hidden input whose value changes between otherwise
                        # identical responses is treated as a nonce token.
                        next if v == form2.auditable[k] ||
                            form.field_type_for( k ) != 'hidden'

                        form.nonce_name = k
                    end
                end
            end
        end

        f
    end

    # @param [String, Nokogiri::HTML::Document] html
    #   Document to analyze, defaults to {#doc}.
    #
    # @return [Array<Link>] Links in `html`.
    def links( html = nil )
        return [] if !text? && !html

        if !(vars = link_vars( @url )).empty? || @response.redirection?
            [Link.new( @url, vars )]
        else
            []
        end | Link.from_document( @url, html || doc )
    end

    # @param [String] url URL to analyze.
    #
    # @return [Hash] Parameters found in `url`.
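    #
    # @example Illustrative sketch
    #    # The URL is made up; the exact return format is whatever
    #    # Link.parse_query_vars produces.
    #    link_vars( 'http://test.com/?id=1&sort=desc' )
    #    #=> { 'id' => '1', 'sort' => 'desc' }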
    def link_vars( url )
        Link.parse_query_vars( url )
    end

    # @return [Array]
    #   Cookies from HTTP headers and response body.
    def cookies
        ( Cookie.from_document( @url, doc ) |
          Cookie.from_headers( @url, @response_headers ) )
    end

    # @return [Array] Distinct links to follow.
    def paths
        return @paths unless @paths.nil?
        @paths = []
        return @paths if !doc
        @paths = run_extractors
    end

    # @return [String] `base href`, if there is one.
    def base
        @base ||= doc.search( '//base[@href]' ).first['href'] rescue nil
    end

    private

    #
    # Runs all path extraction components and returns an array of paths.
    #
    # @return [Array] Paths.
    #
    def run_extractors
        begin
            return self.class.extractors.available.map do |name|
                exception_jail( false ){ self.class.extractors[name].new.run( doc ) }
            end.flatten.uniq.compact.
                map { |path| to_absolute( path ) }.compact.uniq.
                reject { |path| skip?( path ) }
        rescue ::Exception => e
            print_error e.to_s
            print_error_backtrace e
        end
    end

    def self.extractors
        @manager ||= Component::Manager.new( Options.dir['path_extractors'], Extractors )
    end

end
end
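
# A minimal sketch of a path extractor component as loaded by
# {Parser.extractors} from Options.dir['path_extractors'] -- the class name
# and XPath expression below are illustrative assumptions, not one of the
# stock extractors:
#
#    class Arachni::Parser::Extractors::AreaMaps < Arachni::Parser::Extractors::Base
#        # Return `href` attributes of image-map <area> elements as plain
#        # strings; Parser#run_extractors absolutizes and filters them.
#        def run( doc )
#            doc.search( '//area[@href]' ).map { |n| n['href'] }
#        end
#    end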