=begin
    Arachni
    Copyright (c) 2010-2011 Tasos "Zapotek" Laskos

    This is free software; you can copy and distribute and modify
    this program under the terms of the GPL v2.0 License
    (See LICENSE file for details)
=end

require 'nokogiri'
require Arachni::Options.instance.dir['lib'] + 'nokogiri/xml/node'
require 'ostruct'
require 'webrick/cookie'

#
# Overrides Anemone's Page class methods:
# o in_domain?( uri ): adding support for subdomain crawling
# o links(): adding support for frame and iframe src URLs
#
# @author: Tasos "Zapotek" Laskos
#
# @version: 0.1
#
module Anemone

module Extractors

    #
    # Base Spider parser class for modules.
    #
    # The aim of such modules is to extract paths from a webpage for the
    # Spider to follow.
    #
    # @author: Tasos "Zapotek" Laskos
    #
    # @version: 0.1
    # @abstract
    #
    class Paths

        #
        # This method must be implemented by all modules and must return an
        # array of paths as plain strings.
        #
        # @param    [Nokogiri]    doc    Nokogiri document
        #
        # @return   [Array]   paths
        #
        def run( doc )
        end

    end

end
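#
# A hypothetical path extractor, sketched here for illustration only (the class
# name and XPath expression are assumptions, not part of this file). Extractors
# live under path_extractors/ and are loaded by Page#run_modules via the
# ComponentManager, which calls #run on each of them with the page's Nokogiri
# document:
#
#    class Anemone::Extractors::Anchors < Anemone::Extractors::Paths
#
#        # Return the 'href' attribute of every anchor element as plain strings.
#        def run( doc )
#            doc.search( '//a[@href]' ).map { |elem| elem['href'] }
#        end
#
#    end
#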
class Page

    include Arachni::UI::Output

    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error

    # OpenStruct for user-stored data
    attr_accessor :data
    # Integer response code of the page
    attr_accessor :code
    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time

    #
    # Create a new page
    #
    def initialize(url, params = {})
        @url = url
        @data = OpenStruct.new

        @code = params[:code]
        @headers = params[:headers] || {}
        @headers['content-type'] ||= ['']
        @aliases = Array(params[:aka]).compact
        @referer = params[:referer]
        @depth = params[:depth] || 0
        @redirect_to = to_absolute(params[:redirect_to])
        @response_time = params[:response_time]
        @body = params[:body]
        @error = params[:error]

        @fetched = !params[:code].nil?
    end

    #
    # Runs all Spider (path extraction) modules and returns an array of paths
    #
    # @return   [Array]   paths
    #
    def run_modules
        opts = Arachni::Options.instance
        require opts.dir['lib'] + 'component_manager'

        lib = opts.dir['root'] + 'path_extractors/'

        begin
            @@manager ||= ::Arachni::ComponentManager.new( lib, Extractors )
            return @@manager.available.map {
                |name|
                @@manager[name].new.run( doc )
            }.flatten.uniq
        rescue ::Exception => e
            print_error( e.to_s )
            print_debug_backtrace( e )
        end
    end

    def dir( url )
        URI( File.dirname( URI( url.to_s ).path ) + '/' )
    end

    #
    # Array of distinct links to follow
    #
    # @return   [Array]
    #
    def links
        return @links unless @links.nil?
        @links = []

        return @links if !doc

        run_modules( ).each {
            |path|

            next if path.nil? or path.empty?

            abs = to_absolute( URI( path ) ) rescue next

            if in_domain?( abs )
                @links << abs

                # force dir listing
                # ap to_absolute( get_path( abs.to_s ).to_s ).to_s
                # @links << to_absolute( dir( abs.to_s ).to_s ) rescue next
            end
        }

        @links.uniq!
        return @links
    end

    #
    # Nokogiri document for the HTML body
    #
    def doc
        type = Arachni::HTTP.content_type( @headers )
        return if type.is_a?( String ) && !type.substring?( 'text' )

        return @doc if @doc
        @doc = Nokogiri::HTML( @body ) if @body rescue nil
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
        links # force parsing of page links before we trash the document
        @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if the page was fetched with no error, +false+ otherwise.
    #
    def fetched?
        @fetched
    end

    #
    # Array of cookies received with this page as WEBrick::Cookie objects.
    #
    def cookies
        WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue []
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
        headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
        !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
        (300..307).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
        404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
        return nil if link.nil?

        # remove anchor
        link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

        if url = base
            base_url = URI(url)
        else
            base_url = @url.dup
        end

        relative = URI(link)
        absolute = base_url.merge(relative)

        absolute.path = '/' if absolute.path.empty?

        return absolute
    end

    def base
        begin
            tmp = doc.search( '//base[@href]' )
            return tmp[0]['href'].dup
        rescue
            return
        end
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise.
    #
    # The added code enables optional subdomain crawling.
    #
    def in_domain?( uri )
        if( Arachni::Options.instance.follow_subdomains )
            return extract_domain( uri ) == extract_domain( @url )
        end

        uri.host == @url.host
    end

    #
    # Extracts the domain from a URI object
    #
    # @param    [URI]    url
    #
    # @return   [String]
    #
    def extract_domain( url )
        if !url.host then return false end

        splits = url.host.split( /\./ )

        if splits.length == 1 then return true end

        splits[-2] + "." + splits[-1]
    end

    def marshal_dump
        [@url, @headers, @data, @body, @links, @code, @visited, @depth,
         @referer, @redirect_to, @response_time, @fetched]
    end

    def marshal_load(ary)
        @url, @headers, @data, @body, @links, @code, @visited, @depth,
        @referer, @redirect_to, @response_time, @fetched = ary
    end

    def to_hash
        {
            'url'           => @url.to_s,
            'headers'       => Marshal.dump(@headers),
            'data'          => Marshal.dump(@data),
            'body'          => @body,
            'links'         => links.map(&:to_s),
            'code'          => @code,
            'visited'       => @visited,
            'depth'         => @depth,
            'referer'       => @referer.to_s,
            'redirect_to'   => @redirect_to.to_s,
            'response_time' => @response_time,
            'fetched'       => @fetched
        }
    end

    def self.from_hash(hash)
        page = self.new(URI(hash['url']))
        {
            '@headers'       => Marshal.load(hash['headers']),
            '@data'          => Marshal.load(hash['data']),
            '@body'          => hash['body'],
            '@links'         => hash['links'].map { |link| URI(link) },
            '@code'          => hash['code'].to_i,
            '@visited'       => hash['visited'],
            '@depth'         => hash['depth'].to_i,
            '@referer'       => hash['referer'],
            '@redirect_to'   => URI(hash['redirect_to']),
            '@response_time' => hash['response_time'].to_i,
            '@fetched'       => hash['fetched']
        }.each do |var, value|
            page.instance_variable_set(var, value)
        end
        page
    end

end
end
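#
# Rough usage sketch (assumes a fully loaded Arachni environment with the
# path extractors available; the URL, headers and body are made-up values):
#
#    page = Anemone::Page.new( URI( 'http://example.com/' ),
#        :code    => 200,
#        :headers => { 'content-type' => [ 'text/html' ] },
#        :body    => '<a href="/about">About</a>'
#    )
#
#    page.html?     #=> true
#    page.fetched?  #=> true
#    page.links     # absolute, in-domain URIs returned by the path extractors
#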