=begin
    Copyright 2010-2013 Tasos Laskos

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
=end

require 'uri'
require 'ipaddr'
require 'addressable/uri'

module Arachni

#
# Helper method which parses a URL using {Arachni::URI.parse}.
#
# @see Arachni::URI.parse
#
def self.URI( uri )
    Arachni::URI.parse( uri )
end

#
# The URI class automatically normalizes the URLs it is passed to parse
# while maintaining compatibility with Ruby's URI core class by delegating
# missing methods to it -- thus, you can treat it like a Ruby URI and enjoy some
# extra perks along the line.
#
# It also provides *cached* (to maintain a low latency) helper class methods to
# ease common operations such as:
#
# * {.normalize Normalization}.
# * Parsing to {.parse Arachni::URI} (see also {.URI}), {.ruby_parse ::URI} or
#   {.cheap_parse Hash} objects.
# * Conversion to {.to_absolute absolute URLs}.
#
# @author Tasos "Zapotek" Laskos
#
class URI
    include UI::Output
    extend  UI::Output

    include Utilities
    extend  Utilities

    #
    # {URI} error namespace.
    #
    # All {URI} errors inherit from and live under it.
    #
    # @author Tasos "Zapotek" Laskos
    #
    class Error < Arachni::Error
    end

    # Maximum amount of entries for each of the per-operation caches.
    CACHE_SIZES = {
        parse:       600,
        ruby_parse:  600,
        cheap_parse: 600,
        normalize:   1000,
        to_absolute: 1000
    }

    # Shared parser instance and per-operation result caches, keyed by the
    # name of the class method that uses them (looked up via `__method__`).
    CACHE = {
        parser:      ::URI::Parser.new,
        ruby_parse:  Support::Cache::RandomReplacement.new( CACHE_SIZES[:ruby_parse] ),
        parse:       Support::Cache::RandomReplacement.new( CACHE_SIZES[:parse] ),
        cheap_parse: Support::Cache::RandomReplacement.new( CACHE_SIZES[:cheap_parse] ),
        normalize:   Support::Cache::RandomReplacement.new( CACHE_SIZES[:normalize] ),
        to_absolute: Support::Cache::RandomReplacement.new( CACHE_SIZES[:to_absolute] )
    }

    # @return [URI::Parser] cached URI parser
    def self.parser
        CACHE[__method__]
    end

    #
    # URL encodes a string.
    #
    # @param [String] string
    # @param [String, Regexp] bad_characters
    #   Class of characters to encode -- if {String} is passed, it should be
    #   formatted as a regexp (for `Regexp.new`).
    #
    # @return   [String]    encoded string
    #
    def self.encode( string, bad_characters = nil )
        # `compact` drops a nil `bad_characters` so that Addressable falls
        # back to its own default character class.
        Addressable::URI.encode_component( *[string, bad_characters].compact )
    end

    #
    # URL decodes a string.
    #
    # @param [String] string
    #
    # @return   [String]
    #
    def self.decode( string )
        Addressable::URI.unencode( string )
    end

    #
    # Iteratively {.decode URL decodes} a {String} until there are no more
    # characters to be unescaped.
    #
    # @param [String] string
    #
    # @return   [String]    The fully decoded string.
    #
    def self.deep_decode( string )
        string = decode( string ) while string =~ /%[a-fA-F0-9]{2}/

        # A `while` modifier evaluates to nil, so the decoded value has to be
        # returned explicitly.
        string
    end

    #
    # Cached version of {URI#initialize}, if there's a chance that the same
    # URL will be needed to be parsed multiple times you should use this method.
    #
    # @note This method's results are cached for performance reasons.
    #   If you plan on doing something destructive with its return value duplicate
    #   it first because there may be references to it elsewhere.
    #
    # @param    [String, Arachni::URI, nil]  url
    #
    # @return   [Arachni::URI, nil]
    #   `url` itself if it is `nil`/`false` or already an {Arachni::URI},
    #   the parsed URL otherwise, or `nil` if parsing failed.
    #
    # @see URI#initialize
    #
    def self.parse( url )
        return url if !url || url.is_a?( Arachni::URI )

        CACHE[__method__][url] ||= begin
            new( url )
        rescue => e
            print_debug "Failed to parse '#{url}'."
            print_debug "Error: #{e}"
            print_debug_backtrace( e )
            nil
        end
    end

    #
    # {.normalize Normalizes} `url` and uses Ruby's core URI lib to parse it.
    #
    # @note This method's results are cached for performance reasons.
    #   If you plan on doing something destructive with its return value duplicate
    #   it first because there may be references to it elsewhere.
    #
    # @param    [String]  url     URL to parse
    #
    # @return   [URI]
    #
    def self.ruby_parse( url )
        return url if url.to_s.empty? || url.is_a?( ::URI )

        CACHE[__method__][url] ||= begin
            ::URI::Generic.build( cheap_parse( url ) )
        rescue
            # The fast path failed, fall back to Ruby's own parser on the
            # normalized URL.
            begin
                parser.parse( normalize( url ).dup )
            rescue => e
                print_debug "Failed to parse '#{url}'."
                print_debug "Error: #{e}"
                print_debug_backtrace( e )
                nil
            end
        end
    end

    #
    # Performs a parse that is less resource intensive than Ruby's URI lib's
    # method while normalizing the URL (will also discard the fragment and
    # path parameters).
    #
    # @param    [String]  url
    #
    # @return   [Hash]
    #   URL components (frozen):
    #
    #     * `:scheme` -- HTTP or HTTPS
    #     * `:userinfo` -- `username:password`
    #     * `:host`
    #     * `:port`
    #     * `:path`
    #     * `:query`
    #
    # @note This method's results are cached for performance reasons.
    #   If you plan on doing something destructive with its return value duplicate
    #   it first because there may be references to it elsewhere.
    #
    # @note The Hash is suitable for passing to `::URI::Generic.build` -- if
    #   however you plan on doing that you'll be better off just using
    #   {.ruby_parse} which does the same thing and caches the results for some
    #   extra schnell.
    #
    def self.cheap_parse( url )
        return if !url || url.empty?

        cache = CACHE[__method__]

        url   = url.to_s.dup
        # Untouched copy of the input, used as the cache key and in messages.
        c_url = url.to_s.dup

        components = {
            scheme:   nil,
            userinfo: nil,
            host:     nil,
            port:     nil,
            path:     nil,
            query:    nil
        }

        valid_schemes = %w(http https)

        begin
            # `:err` marks a URL which has previously failed to parse.
            if (v = cache[url]) && v == :err
                return
            elsif v
                return v
            end

            # we're not smart enough for scheme-less URLs and if we're to go
            # into heuristics then there's no reason to not just use Addressable's parser
            if url.start_with?( '//' )
                return cache[c_url] = addressable_parse( c_url ).freeze
            end

            url = url.encode( 'UTF-8', undef: :replace, invalid: :replace )

            # remove the fragment if there is one
            url = url.split( '#', 2 )[0...-1].join if url.include?( '#' )

            url = html_decode( url )

            dupped_url = url.dup
            has_path = true

            splits = url.split( ':' )
            if !splits.empty? && valid_schemes.include?( splits.first.downcase )
                splits = url.split( '://', 2 )
                components[:scheme] = splits.shift
                components[:scheme].downcase! if components[:scheme]

                if url = splits.shift
                    splits = url.split( '?' ).first.split( '@', 2 )

                    if splits.size > 1
                        components[:userinfo] = splits.first
                        url = splits.shift
                    end

                    if !splits.empty?
                        splits = splits.last.split( '/', 2 )
                        url = splits.last

                        splits = splits.first.split( ':', 2 )
                        if splits.size == 2
                            host = splits.first
                            components[:port] = Integer( splits.last ) if splits.last && !splits.last.empty?
                            # Port 80 is treated as the default and dropped.
                            components[:port] = nil if components[:port] == 80
                            url.gsub!( ':' + components[:port].to_s, '' )
                        else
                            host = splits.last
                        end

                        if components[:host] = host
                            url.gsub!( host, '' )
                            components[:host].downcase!
                        end
                    else
                        has_path = false
                    end
                else
                    has_path = false
                end
            end

            if has_path
                splits = url.split( '?', 2 )
                if components[:path] = splits.shift
                    components[:path] = '/' + components[:path] if components[:scheme]
                    # Collapse consecutive slashes.
                    components[:path].gsub!( /\/+/, '/' )
                    components[:path] =
                        encode( decode( components[:path] ),
                                Addressable::URI::CharacterClasses::PATH )
                end

                if c_url.include?( '?' ) && !(query = dupped_url.split( '?', 2 ).last).empty?
                    components[:query] = (query.split( '&', -1 ).map do |pair|
                        Addressable::URI.normalize_component( pair,
                            Addressable::URI::CharacterClasses::QUERY.sub( '\\&', '' ),
                            '+'
                        )
                    end).join( '&' )
                end
            end

            components[:path] ||= components[:scheme] ? '/' : nil

            # Remove path params
            if components[:path]
                components[:path] = components[:path].split( ';', 2 ).first
            end

            components.values.each( &:freeze )

            cache[c_url] = components.freeze
        rescue => e
            begin
                print_debug "Failed to fast-parse '#{c_url}', falling back to slow-parse."
                print_debug "Error: #{e}"
                print_debug_backtrace( e )

                cache[c_url] = addressable_parse( c_url ).freeze
            rescue => ex
                print_debug "Failed to parse '#{c_url}'."
                print_debug "Error: #{ex}"
                print_debug_backtrace( ex )

                cache[c_url] = :err
                nil
            end
        end
    end

    #
    # Performs a parse using the `Addressable::URI` lib while normalizing the
    # URL (will also discard the fragment).
    #
    # This method is not cached and solely exists as a fallback used by {.cheap_parse}.
    #
    # @param    [String]  url
    #
    # @return   [Hash]
    #   URL components:
    #
    #     * `:scheme` -- HTTP or HTTPS
    #     * `:userinfo` -- `username:password`
    #     * `:host`
    #     * `:port`
    #     * `:path`
    #     * `:query`
    #
    # @note The Hash is suitable for passing to `::URI::Generic.build` -- if
    #   however you plan on doing that you'll be better off just using
    #   {.ruby_parse} which does the same thing and caches the results for some
    #   extra schnell.
    #
    def self.addressable_parse( url )
        u = Addressable::URI.parse( html_decode( url.to_s ) ).normalize
        u.fragment = nil
        h = u.to_hash

        h[:path].gsub!( /\/+/, '/' ) if h[:path]

        # Fold the separate user/password entries into a single :userinfo one.
        if h[:user]
            h[:userinfo] = h.delete( :user )
            h[:userinfo] << ":#{h.delete( :password )}" if h[:password]
        end
        h
    end

    #
    # {.normalize Normalizes} and converts a `relative` URL to an absolute one
    # by merging it with a `reference` URL.
    #
    # Pretty much a cached version of {#to_absolute}.
    #
    # @note This method's results are cached for performance reasons.
    #   If you plan on doing something destructive with its return value duplicate
    #   it first because there may be references to it elsewhere.
    #
    # @param    [String]    relative
    # @param    [String]    reference    absolute url to use as a reference
    #
    # @return   [String, nil]
    #   Absolute URL (frozen), `reference` if `relative` is `nil` or empty,
    #   `nil` if the conversion failed.
    #
    def self.to_absolute( relative, reference = Options.instance.url.to_s )
        return reference if !relative || relative.empty?
        key = relative + ' :: ' + reference

        cache = CACHE[__method__]
        begin
            # `:err` marks a pair which has previously failed to convert.
            if (v = cache[key]) && v == :err
                return
            elsif v
                return v
            end

            parsed_ref = parse( reference )

            # scheme-less URLs are expensive to parse so let's resolve the issue here
            relative = "#{parsed_ref.scheme}:#{relative}" if relative.start_with?( '//' )

            cache[key] = parse( relative ).to_absolute( parsed_ref ).to_s.freeze
        rescue
            cache[key] = :err
            nil
        end
    end

    #
    # Uses {.cheap_parse} to parse and normalize the URL and then converts
    # it to a common {String} format.
    #
    # @note This method's results are cached for performance reasons.
    #   If you plan on doing something destructive with its return value duplicate
    #   it first because there may be references to it elsewhere.
    #
    # @param    [String]    url
    #
    # @return   [String, nil]
    #   Normalized URL (frozen), `nil` if `url` is `nil`/empty or could not
    #   be normalized.
    #
    def self.normalize( url )
        return if !url || url.empty?

        cache = CACHE[__method__]

        url   = url.to_s.strip.dup
        # Untouched copy of the stripped input, used as the cache key.
        c_url = url.to_s.strip.dup

        begin
            # `:err` marks a URL which has previously failed to normalize.
            if (v = cache[url]) && v == :err
                return
            elsif v
                return v
            end

            components = cheap_parse( url )

            normalized = ''
            normalized << components[:scheme] + '://' if components[:scheme]

            if components[:userinfo]
                normalized << components[:userinfo]
                normalized << '@'
            end

            if components[:host]
                normalized << components[:host]
                normalized << ':' + components[:port].to_s if components[:port]
            end

            normalized << components[:path] if components[:path]
            normalized << '?' + components[:query] if components[:query]

            cache[c_url] = normalized.freeze
        rescue => e
            print_debug "Failed to normalize '#{c_url}'."
            print_debug "Error: #{e}"
            print_debug_backtrace( e )

            cache[c_url] = :err
            nil
        end
    end

    #
    # {.normalize Normalizes} and parses the provided URL.
    #
    # Will discard the fragment component, if there is one.
    #
    # @param    [Arachni::URI, String, URI, Hash]    url
    #   {String} URL to parse, `URI` to convert, or a `Hash` holding URL components
    #   (for `URI::Generic.build`). Also accepts {Arachni::URI} for convenience.
    #
    # @raise    [TypeError] If `url` is of an unsupported type.
    # @raise    [Error]     If the URL could not be parsed.
    #
    def initialize( url )
        @arachni_opts = Options.instance

        @parsed_url = case url
                          when String
                              self.class.ruby_parse( url )

                          when ::URI
                              url.dup

                          when Hash
                              ::URI::Generic.build( url )

                          when Arachni::URI
                              self.parsed_url = url.parsed_url.dup

                          else
                              to_string = url.to_s rescue ''
                              msg = "Argument must either be String, URI or Hash"
                              msg << " -- #{url.class.name} '#{to_string}' passed."
                              fail TypeError.new( msg )
                      end

        fail Error, 'Failed to parse URL.' if !@parsed_url
    end

    # @param    [#to_s] other
    #
    # @return   [Bool]
    #   `true` if the {String} representations of `self` and `other` match.
    def ==( other )
        to_s == other.to_s
    end

    #
    # Converts self into an absolute URL using `reference` to fill in the missing data.
    #
    # @param    [Arachni::URI, URI, String]    reference    Full, absolute URL.
    #
    # @return   [Arachni::URI]  Self, as an absolute URL.
    #
    def to_absolute( reference )
        absolute = case reference
                       when Arachni::URI
                           reference.parsed_url
                       when ::URI
                           reference
                       else
                           self.class.new( reference.to_s ).parsed_url
                   end.merge( @parsed_url )

        self.class.new( absolute )
    end

    # @return   [String]
    #   The URL up to its resource component (query, fragment, etc).
    def without_query
        to_s.split( '?', 2 ).first.to_s
    end

    # @return   [String, nil]
    #   The extension of the URI resource, `nil` if the resource name has no
    #   `.` in it.
    def resource_extension
        resource_name = path.split( '/' ).last.to_s
        return if !resource_name.include?( '.' )
        resource_name.split( '.' ).last
    end

    # @return   [String, nil]
    #   The URL up to its path component (no resource name, query, fragment, etc),
    #   `nil` if there is no path.
    def up_to_path
        return if !path
        uri_path = path.dup

        # Only strip the last segment when it looks like a file (has an extension).
        uri_path = File.dirname( uri_path ) if !File.extname( path ).empty?

        uri_path << '/' if uri_path[-1] != '/'

        uri_str = "#{scheme}://#{host}"
        uri_str << ':' + port.to_s if port && port != 80
        uri_str << uri_path
    end

    # @return [String]  domain_name.tld
    def domain
        return host if ip_address?

        s = host.split( '.' )
        return s.first if s.size == 1
        return host if s.size == 2

        # Drop the first label and keep the rest.
        s[1..-1].join( '.' )
    end

    # @return   [Boolean]
    #   `true` if the URI contains an IP address, `false` otherwise.
    def ip_address?
        !(IPAddr.new( host ) rescue nil).nil?
    end

    #
    # Checks if self exceeds a given directory `depth`.
    #
    # @param    [Integer]   depth   Depth to check for.
    #
    # @return   [Bool]  `true` if self is deeper than `depth`, `false` otherwise.
    #
    def too_deep?( depth )
        depth.to_i > 0 && (depth + 1) <= path.to_s.count( '/' )
    end

    #
    # Checks if self should be excluded based on the provided `patterns`.
    #
    # @param    [Array<Regexp,String>] patterns
    #
    # @return   [Bool]  `true` if self matches a pattern, `false` otherwise.
    #
    # @raise    [TypeError] If `patterns` is `nil`.
    #
    def exclude?( patterns )
        fail TypeError.new( 'Array<Regexp,String> expected, got nil instead' ) if patterns.nil?
        ensure_patterns( patterns ).each { |pattern| return true if to_s =~ pattern }
        false
    end

    #
    # Checks if self should be included based on the provided `patterns`.
    #
    # @param    [Array<Regexp,String>] patterns
    #
    # @return   [Bool]
    #   `true` if self matches a pattern (or `patterns` is empty),
    #   `false` otherwise.
    #
    # @raise    [TypeError] If `patterns` is `nil`.
    #
    def include?( patterns )
        fail TypeError.new( 'Array<Regexp,String> expected, got nil instead' ) if patterns.nil?

        rules = ensure_patterns( patterns )
        return true if !rules || rules.empty?

        rules.each { |pattern| return true if to_s =~ pattern }
        false
    end

    #
    # @param    [Bool]  include_subdomain Match subdomains too?
    #   If true will compare full hostnames, otherwise will discard subdomains.
    #
    # @param    [Arachni::URI, URI, Hash, String]    other  Reference URL.
    #
    # @return   [Bool]
    #   `true` if self is in the same domain as the `other` URL (or `other`
    #   is `nil`/`false`), `false` otherwise -- including when the
    #   comparison raises an error.
    #
    def in_domain?( include_subdomain, other )
        return true if !other

        other = self.class.new( other ) if !other.is_a?( Arachni::URI )
        include_subdomain ? other.host == host : other.domain == domain
    rescue
        false
    end

    # @return   [Bool]  `true` if the scheme is `mailto`, `false` otherwise.
    def mailto?
        scheme == 'mailto'
    end

    # @return [String] URL
    def to_s
        @parsed_url.to_s
    end

    # @return   [Integer]   Hash of the {String} representation of the URL.
    def hash
        to_s.hash
    end

    # @return   [Object]
    #   Result of calling `persistent_hash` on the {String} representation
    #   of the URL (NOTE: `String#persistent_hash` is not defined in this
    #   file, presumably a project core extension).
    def persistent_hash
        to_s.persistent_hash
    end

    protected

    # @return   [::URI] The wrapped Ruby URI object.
    def parsed_url
        @parsed_url
    end

    # @param    [::URI] url Ruby URI object to wrap.
    def parsed_url=( url )
        @parsed_url = url
    end

    private

    #
    # Converts `arr` to an `Array` of `Regexp` objects.
    #
    # @param    [Array, Object] arr
    #   Pattern(s); non-Array values get wrapped, `nil` entries are removed
    #   and non-Regexp entries are compiled from their {String} form.
    #
    # @return   [Array<Regexp>]
    #
    def ensure_patterns( arr )
        if arr.is_a?( Array )
            arr
        else
            [arr].flatten
        end.compact.map { |p| p.is_a?( Regexp ) ? p : Regexp.new( p.to_s ) }
    end

    #
    # Delegates unimplemented methods to Ruby's `URI::Generic` class for
    # compatibility.
    #
    def method_missing( sym, *args, &block )
        if @parsed_url.respond_to?( sym )
            @parsed_url.send( sym, *args, &block )
        else
            super
        end
    end

    #
    # Lets `respond_to?` report the methods which {#method_missing} delegates
    # to the wrapped URI, in addition to the ones defined by this class.
    #
    # @param    [Symbol]    sym                 Method name.
    # @param    [Bool]      include_private     Also consider private methods?
    #
    # @return   [Bool]
    #
    def respond_to_missing?( sym, include_private = false )
        @parsed_url.respond_to?( sym, include_private ) || super
    end

end
end