common.rb in rubysl-uri-2.0.0

- old
+ new

@@ -1,14 +1,20 @@
+#--
 # = uri/common.rb
 #
 # Author:: Akira Yamada <akira@ruby-lang.org>
-# Revision:: $Id: common.rb 14178 2007-12-10 09:31:55Z matz $
-# License:: 
+# Revision:: $Id$
+# License::
 #   You can redistribute it and/or modify it under the same term as Ruby.
 #
+# See URI for general documentation
+#
 
 module URI
+  #
+  # Includes URI::REGEXP::PATTERN
+  #
   module REGEXP
     #
     # Patterns used to parse URI's
     #
     module PATTERN
@@ -29,33 +35,340 @@
       # escaped       = "%" hex hex
       ESCAPED = "%[#{HEX}]{2}"
       # mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
       #                 "(" | ")"
       # unreserved    = alphanum | mark
-      UNRESERVED = "-_.!~*'()#{ALNUM}"
+      UNRESERVED = "\\-_.!~*'()#{ALNUM}"
       # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
       #                 "$" | ","
-      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | 
+      # reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
       #                 "$" | "," | "[" | "]" (RFC 2732)
       RESERVED = ";/?:@&=+$,\\[\\]"
 
+      # domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
+      DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
+      # toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
+      TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
+      # hostname      = *( domainlabel "." ) toplabel [ "." ]
+      HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"
+
+      # :startdoc:
+    end # PATTERN
+
+    # :startdoc:
+  end # REGEXP
+
+  # Class that parses Strings into URIs.
+  #
+  # It holds a Hash of pattern Strings and a Hash of Regexps used to match and validate.
+  #
+  class Parser
+    include REGEXP
+
+    #
+    # == Synopsis
+    #
+    #   URI::Parser.new([opts])
+    #
+    # == Args
+    #
+    # The constructor accepts a hash as options for parser.
+    # Keys of options are pattern names of URI components
+    # and values of options are pattern strings.
+    # The constructor generates a set of regexps for parsing URIs.
+    #
+    # You can use the following keys:
+    #
+    #   * :ESCAPED (URI::PATTERN::ESCAPED by default)
+    #   * :UNRESERVED (URI::PATTERN::UNRESERVED by default)
+    #   * :DOMLABEL (URI::PATTERN::DOMLABEL by default)
+    #   * :TOPLABEL (URI::PATTERN::TOPLABEL by default)
+    #   * :HOSTNAME (URI::PATTERN::HOSTNAME by default)
+    #
+    # == Examples
+    #
+    #   p = URI::Parser.new(:ESCAPED => "(?:%[a-fA-F0-9]{2}|%u[a-fA-F0-9]{4})")
+    #   u = p.parse("http://example.jp/%uABCD") #=> #<URI::HTTP:0xb78cf4f8 URL:http://example.jp/%uABCD>
+    #   URI.parse(u.to_s) #=> raises URI::InvalidURIError
+    #
+    #   s = "http://example.com/ABCD"
+    #   u1 = p.parse(s) #=> #<URI::HTTP:0xb78c3220 URL:http://example.com/ABCD>
+    #   u2 = URI.parse(s) #=> #<URI::HTTP:0xb78b6d54 URL:http://example.com/ABCD>
+    #   u1 == u2 #=> true
+    #   u1.eql?(u2) #=> false
+    #
+    def initialize(opts = {})
+      @pattern = initialize_pattern(opts)
+      @pattern.each_value {|v| v.freeze}
+      @pattern.freeze
+
+      @regexp = initialize_regexp(@pattern)
+      @regexp.each_value {|v| v.freeze}
+      @regexp.freeze
+    end
+
+    # The Hash of patterns.
+    #
+    # see also URI::Parser.initialize_pattern
+    attr_reader :pattern
+
+    # The Hash of Regexp
+    #
+    # see also URI::Parser.initialize_regexp
+    attr_reader :regexp
+
+    # Splits +uri+ into an Array of nine components using regexp[:ABS_URI] or regexp[:REL_URI]
+    def split(uri)
+      case uri
+      when ''
+        # null uri
+
+      when @regexp[:ABS_URI]
+        scheme, opaque, userinfo, host, port,
+          registry, path, query, fragment = $~[1..-1]
+
+        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+
+        # absoluteURI   = scheme ":" ( hier_part | opaque_part )
+        # hier_part     = ( net_path | abs_path ) [ "?" query ]
+        # opaque_part   = uric_no_slash *uric
+
+        # abs_path      = "/"  path_segments
+        # net_path      = "//" authority [ abs_path ]
+
+        # authority     = server | reg_name
+        # server        = [ [ userinfo "@" ] hostport ]
+
+        if !scheme
+          raise InvalidURIError,
+            "bad URI(absolute but no scheme): #{uri}"
+        end
+        if !opaque && (!path && (!host && !registry))
+          raise InvalidURIError,
+            "bad URI(absolute but no path): #{uri}"
+        end
+
+      when @regexp[:REL_URI]
+        scheme = nil
+        opaque = nil
+
+        userinfo, host, port, registry,
+          rel_segment, abs_path, query, fragment = $~[1..-1]
+        if rel_segment && abs_path
+          path = rel_segment + abs_path
+        elsif rel_segment
+          path = rel_segment
+        elsif abs_path
+          path = abs_path
+        end
+
+        # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+
+        # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
+
+        # net_path      = "//" authority [ abs_path ]
+        # abs_path      = "/"  path_segments
+        # rel_path      = rel_segment [ abs_path ]
+
+        # authority     = server | reg_name
+        # server        = [ [ userinfo "@" ] hostport ]
+
+      else
+        raise InvalidURIError, "bad URI(is not URI?): #{uri}"
+      end
+
+      path = '' if !path && !opaque # (see RFC2396 Section 5.2)
+      ret = [
+        scheme,
+        userinfo, host, port,         # X
+        registry,                     # X
+        path,                         # Y
+        opaque,                       # Y
+        query,
+        fragment
+      ]
+      return ret
+    end
+
+    #
+    # == Args
+    #
+    # +uri+::
+    #    String
+    #
+    # == Description
+    #
+    # Parses +uri+ and constructs either the matching URI scheme object
+    # (FTP, HTTP, HTTPS, LDAP, LDAPS, or MailTo) or a URI::Generic.
+    #
+    # == Usage
+    #
+    #   p = URI::Parser.new
+    #   p.parse("ldap://ldap.example.com/dc=example?user=john")
+    #   #=> #<URI::LDAP:0x00000000b9e7e8 URL:ldap://ldap.example.com/dc=example?user=john>
+    #
+    def parse(uri)
+      scheme, userinfo, host, port,
+        registry, path, opaque, query, fragment = self.split(uri)
+
+      if scheme && URI.scheme_list.include?(scheme.upcase)
+        URI.scheme_list[scheme.upcase].new(scheme, userinfo, host, port,
+                                           registry, path, opaque, query,
+                                           fragment, self)
+      else
+        Generic.new(scheme, userinfo, host, port,
+                    registry, path, opaque, query,
+                    fragment, self)
+      end
+    end
+
+
+    #
+    # == Args
+    #
+    # +uris+::
+    #    an Array of Strings
+    #
+    # == Description
+    #
+    # Attempts to parse and merge a set of URIs
+    #
+    def join(*uris)
+      uris[0] = convert_to_uri(uris[0])
+      uris.inject :merge
+    end
+
+    #
+    # :call-seq:
+    #   extract( str )
+    #   extract( str, schemes )
+    #   extract( str, schemes ) {|item| block }
+    #
+    # == Args
+    #
+    # +str+::
+    #    String to search
+    # +schemes+::
+    #    Patterns to apply to +str+
+    #
+    # == Description
+    #
+    # Scans +str+ for URIs matching make_regexp(schemes).
+    # If no +block+ is given, returns an Array of the matches;
+    # otherwise yields each match to +block+ and returns nil.
+    #
+    # see also URI::Parser.make_regexp
+    #
+    def extract(str, schemes = nil)
+      if block_given?
+        str.scan(make_regexp(schemes)) { yield $& }
+        nil
+      else
+        result = []
+        str.scan(make_regexp(schemes)) { result.push $& }
+        result
+      end
+    end
+
+    # Returns self.regexp[:ABS_URI_REF] when +schemes+ is nil. Otherwise builds a Regexp from
+    # self.pattern[:X_ABS_URI], preceded by a lookahead for Regexp.union(*schemes) followed by ':'.
+    def make_regexp(schemes = nil)
+      unless schemes
+        @regexp[:ABS_URI_REF]
+      else
+        /(?=#{Regexp.union(*schemes)}:)#{@pattern[:X_ABS_URI]}/x
+      end
+    end
+
+    #
+    # :call-seq:
+    #   escape( str )
+    #   escape( str, unsafe )
+    #
+    # == Args
+    #
+    # +str+::
+    #    String to make safe
+    # +unsafe+::
+    #    Regexp to apply. Defaults to self.regexp[:UNSAFE]
+    #
+    # == Description
+    #
+    # Constructs a safe String from +str+ by replacing each unsafe character
+    # with the %XX escape codes of its bytes.
+    #
+    def escape(str, unsafe = @regexp[:UNSAFE])
+      unless unsafe.kind_of?(Regexp)
+        # perhaps unsafe is String object
+        unsafe = Regexp.new("[#{Regexp.quote(unsafe)}]", false)
+      end
+      str.gsub(unsafe) do
+        us = $&
+        tmp = ''
+        us.each_byte do |uc|
+          tmp << sprintf('%%%02X', uc)
+        end
+        tmp
+      end.force_encoding(Encoding::US_ASCII)
+    end
+
+    #
+    # :call-seq:
+    #   unescape( str )
+    #   unescape( str, escaped )
+    #
+    # == Args
+    #
+    # +str+::
+    #    String to remove escapes from
+    # +escaped+::
+    #    Regexp to apply. Defaults to self.regexp[:ESCAPED]
+    #
+    # == Description
+    #
+    # Removes escapes from +str+
+    #
+    def unescape(str, escaped = @regexp[:ESCAPED])
+      str.gsub(escaped) { [$&[1, 2].hex].pack('C') }.force_encoding(str.encoding)
+    end
+
+    @@to_s = Kernel.instance_method(:to_s)
+    def inspect
+      @@to_s.bind(self).call
+    end
+
+    private
+
+    # Constructs the default Hash of patterns
+    def initialize_pattern(opts = {})
+      ret = {}
+      ret[:ESCAPED] = escaped = (opts.delete(:ESCAPED) || PATTERN::ESCAPED)
+      ret[:UNRESERVED] = unreserved = opts.delete(:UNRESERVED) || PATTERN::UNRESERVED
+      ret[:RESERVED] = reserved = opts.delete(:RESERVED) || PATTERN::RESERVED
+      ret[:DOMLABEL] = opts.delete(:DOMLABEL) || PATTERN::DOMLABEL
+      ret[:TOPLABEL] = opts.delete(:TOPLABEL) || PATTERN::TOPLABEL
+      ret[:HOSTNAME] = hostname = opts.delete(:HOSTNAME)
+
+      # RFC 2396 (URI Generic Syntax)
+      # RFC 2732 (IPv6 Literal Addresses in URL's)
+      # RFC 2373 (IPv6 Addressing Architecture)
+
       # uric          = reserved | unreserved | escaped
-      URIC = "(?:[#{UNRESERVED}#{RESERVED}]|#{ESCAPED})"
+      ret[:URIC] = uric = "(?:[#{unreserved}#{reserved}]|#{escaped})"
       # uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
       #                 "&" | "=" | "+" | "$" | ","
-      URIC_NO_SLASH = "(?:[#{UNRESERVED};?:@&=+$,]|#{ESCAPED})"
+      ret[:URIC_NO_SLASH] = uric_no_slash = "(?:[#{unreserved};?:@&=+$,]|#{escaped})"
       # query         = *uric
-      QUERY = "#{URIC}*"
+      ret[:QUERY] = query = "#{uric}*"
       # fragment      = *uric
-      FRAGMENT = "#{URIC}*"
+      ret[:FRAGMENT] = fragment = "#{uric}*"
 
-      # domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
-      DOMLABEL = "(?:[#{ALNUM}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
-      # toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
-      TOPLABEL = "(?:[#{ALPHA}](?:[-#{ALNUM}]*[#{ALNUM}])?)"
       # hostname      = *( domainlabel "." ) toplabel [ "." ]
-      HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"
+      # reg-name      = *( unreserved / pct-encoded / sub-delims ) # RFC3986
+      unless hostname
+        ret[:HOSTNAME] = hostname = "(?:[a-zA-Z0-9\\-.]|%\\h\\h)+"
+      end
 
       # RFC 2373, APPENDIX B:
       # IPv6address = hexpart [ ":" IPv4address ]
       # IPv4address   = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
       # hexpart = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
@@ -64,158 +377,183 @@
       #
       # XXX: This definition has a flaw. "::" + IPv4address must be
       # allowed too.  Here is a replacement.
       #
       # IPv4address = 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT "." 1*3DIGIT
-      IPV4ADDR = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
+      ret[:IPV4ADDR] = ipv4addr = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"
       # hex4     = 1*4HEXDIG
-      HEX4 = "[#{HEX}]{1,4}"
+      hex4 = "[#{PATTERN::HEX}]{1,4}"
       # lastpart = hex4 | IPv4address
-      LASTPART = "(?:#{HEX4}|#{IPV4ADDR})"
+      lastpart = "(?:#{hex4}|#{ipv4addr})"
       # hexseq1  = *( hex4 ":" ) hex4
-      HEXSEQ1 = "(?:#{HEX4}:)*#{HEX4}"
+      hexseq1 = "(?:#{hex4}:)*#{hex4}"
       # hexseq2  = *( hex4 ":" ) lastpart
-      HEXSEQ2 = "(?:#{HEX4}:)*#{LASTPART}"
+      hexseq2 = "(?:#{hex4}:)*#{lastpart}"
       # IPv6address = hexseq2 | [ hexseq1 ] "::" [ hexseq2 ]
-      IPV6ADDR = "(?:#{HEXSEQ2}|(?:#{HEXSEQ1})?::(?:#{HEXSEQ2})?)"
+      ret[:IPV6ADDR] = ipv6addr = "(?:#{hexseq2}|(?:#{hexseq1})?::(?:#{hexseq2})?)"
 
       # IPv6prefix  = ( hexseq1 | [ hexseq1 ] "::" [ hexseq1 ] ) "/" 1*2DIGIT
       # unused
 
       # ipv6reference = "[" IPv6address "]" (RFC 2732)
-      IPV6REF = "\\[#{IPV6ADDR}\\]"
+      ret[:IPV6REF] = ipv6ref = "\\[#{ipv6addr}\\]"
 
       # host          = hostname | IPv4address
       # host          = hostname | IPv4address | IPv6reference (RFC 2732)
-      HOST = "(?:#{HOSTNAME}|#{IPV4ADDR}|#{IPV6REF})"
+      ret[:HOST] = host = "(?:#{hostname}|#{ipv4addr}|#{ipv6ref})"
       # port          = *digit
-      PORT = '\d*'
+      port = '\d*'
       # hostport      = host [ ":" port ]
-      HOSTPORT = "#{HOST}(?::#{PORT})?"
+      ret[:HOSTPORT] = hostport = "#{host}(?::#{port})?"
 
       # userinfo      = *( unreserved | escaped |
       #                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
-      USERINFO = "(?:[#{UNRESERVED};:&=+$,]|#{ESCAPED})*"
+      ret[:USERINFO] = userinfo = "(?:[#{unreserved};:&=+$,]|#{escaped})*"
 
       # pchar         = unreserved | escaped |
       #                 ":" | "@" | "&" | "=" | "+" | "$" | ","
-      PCHAR = "(?:[#{UNRESERVED}:@&=+$,]|#{ESCAPED})"
+      pchar = "(?:[#{unreserved}:@&=+$,]|#{escaped})"
       # param         = *pchar
-      PARAM = "#{PCHAR}*"
+      param = "#{pchar}*"
       # segment       = *pchar *( ";" param )
-      SEGMENT = "#{PCHAR}*(?:;#{PARAM})*"
+      segment = "#{pchar}*(?:;#{param})*"
       # path_segments = segment *( "/" segment )
-      PATH_SEGMENTS = "#{SEGMENT}(?:/#{SEGMENT})*"
+      ret[:PATH_SEGMENTS] = path_segments = "#{segment}(?:/#{segment})*"
 
       # server        = [ [ userinfo "@" ] hostport ]
-      SERVER = "(?:#{USERINFO}@)?#{HOSTPORT}"
+      server = "(?:#{userinfo}@)?#{hostport}"
       # reg_name      = 1*( unreserved | escaped | "$" | "," |
       #                     ";" | ":" | "@" | "&" | "=" | "+" )
-      REG_NAME = "(?:[#{UNRESERVED}$,;:@&=+]|#{ESCAPED})+"
+      ret[:REG_NAME] = reg_name = "(?:[#{unreserved}$,;:@&=+]|#{escaped})+"
       # authority     = server | reg_name
-      AUTHORITY = "(?:#{SERVER}|#{REG_NAME})"
+      authority = "(?:#{server}|#{reg_name})"
 
       # rel_segment   = 1*( unreserved | escaped |
       #                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
-      REL_SEGMENT = "(?:[#{UNRESERVED};@&=+$,]|#{ESCAPED})+"
+      ret[:REL_SEGMENT] = rel_segment = "(?:[#{unreserved};@&=+$,]|#{escaped})+"
 
       # scheme        = alpha *( alpha | digit | "+" | "-" | "." )
-      SCHEME = "[#{ALPHA}][-+.#{ALPHA}\\d]*"
+      ret[:SCHEME] = scheme = "[#{PATTERN::ALPHA}][\\-+.#{PATTERN::ALPHA}\\d]*"
 
       # abs_path      = "/"  path_segments
-      ABS_PATH = "/#{PATH_SEGMENTS}"
+      ret[:ABS_PATH] = abs_path = "/#{path_segments}"
       # rel_path      = rel_segment [ abs_path ]
-      REL_PATH = "#{REL_SEGMENT}(?:#{ABS_PATH})?"
+      ret[:REL_PATH] = rel_path = "#{rel_segment}(?:#{abs_path})?"
       # net_path      = "//" authority [ abs_path ]
-      NET_PATH   = "//#{AUTHORITY}(?:#{ABS_PATH})?"
+      ret[:NET_PATH] = net_path = "//#{authority}(?:#{abs_path})?"
 
       # hier_part     = ( net_path | abs_path ) [ "?" query ]
-      HIER_PART   = "(?:#{NET_PATH}|#{ABS_PATH})(?:\\?(?:#{QUERY}))?"
+      ret[:HIER_PART] = hier_part = "(?:#{net_path}|#{abs_path})(?:\\?(?:#{query}))?"
       # opaque_part   = uric_no_slash *uric
-      OPAQUE_PART = "#{URIC_NO_SLASH}#{URIC}*"
+      ret[:OPAQUE_PART] = opaque_part = "#{uric_no_slash}#{uric}*"
 
       # absoluteURI   = scheme ":" ( hier_part | opaque_part )
-      ABS_URI   = "#{SCHEME}:(?:#{HIER_PART}|#{OPAQUE_PART})"
+      ret[:ABS_URI] = abs_uri = "#{scheme}:(?:#{hier_part}|#{opaque_part})"
       # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
-      REL_URI = "(?:#{NET_PATH}|#{ABS_PATH}|#{REL_PATH})(?:\\?#{QUERY})?"
+      ret[:REL_URI] = rel_uri = "(?:#{net_path}|#{abs_path}|#{rel_path})(?:\\?#{query})?"
 
       # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
-      URI_REF = "(?:#{ABS_URI}|#{REL_URI})?(?:##{FRAGMENT})?"
+      ret[:URI_REF] = "(?:#{abs_uri}|#{rel_uri})?(?:##{fragment})?"
 
-      # XXX:
-      X_ABS_URI = "
-        (#{PATTERN::SCHEME}):                     (?# 1: scheme)
+      ret[:X_ABS_URI] = "
+        (#{scheme}):                           (?# 1: scheme)
         (?:
-           (#{PATTERN::OPAQUE_PART})              (?# 2: opaque)
+           (#{opaque_part})                    (?# 2: opaque)
         |
            (?:(?:
              //(?:
-                 (?:(?:(#{PATTERN::USERINFO})@)?  (?# 3: userinfo)
-                   (?:(#{PATTERN::HOST})(?::(\\d*))?))?(?# 4: host, 5: port)
+                 (?:(?:(#{userinfo})@)?        (?# 3: userinfo)
+                   (?:(#{host})(?::(\\d*))?))? (?# 4: host, 5: port)
                |
-                 (#{PATTERN::REG_NAME})           (?# 6: registry)
+                 (#{reg_name})                 (?# 6: registry)
                )
              |
-             (?!//))                              (?# XXX: '//' is the mark for hostport)
-             (#{PATTERN::ABS_PATH})?              (?# 7: path)
-           )(?:\\?(#{PATTERN::QUERY}))?           (?# 8: query)
+             (?!//))                           (?# XXX: '//' is the mark for hostport)
+             (#{abs_path})?                    (?# 7: path)
+           )(?:\\?(#{query}))?                 (?# 8: query)
         )
-        (?:\\#(#{PATTERN::FRAGMENT}))?            (?# 9: fragment)
+        (?:\\#(#{fragment}))?                  (?# 9: fragment)
       "
-      X_REL_URI = "
+
+      ret[:X_REL_URI] = "
         (?:
           (?:
             //
             (?:
-              (?:(#{PATTERN::USERINFO})@)?       (?# 1: userinfo)
-                (#{PATTERN::HOST})?(?::(\\d*))?  (?# 2: host, 3: port)
+              (?:(#{userinfo})@)?       (?# 1: userinfo)
+                (#{host})?(?::(\\d*))?  (?# 2: host, 3: port)
             |
-              (#{PATTERN::REG_NAME})             (?# 4: registry)
+              (#{reg_name})             (?# 4: registry)
             )
           )
         |
-          (#{PATTERN::REL_SEGMENT})              (?# 5: rel_segment)
+          (#{rel_segment})              (?# 5: rel_segment)
         )?
-        (#{PATTERN::ABS_PATH})?                  (?# 6: abs_path)
-        (?:\\?(#{PATTERN::QUERY}))?              (?# 7: query)
-        (?:\\#(#{PATTERN::FRAGMENT}))?           (?# 8: fragment)
+        (#{abs_path})?                  (?# 6: abs_path)
+        (?:\\?(#{query}))?              (?# 7: query)
+        (?:\\#(#{fragment}))?           (?# 8: fragment)
       "
-      # :startdoc:
-    end # PATTERN
 
-    # :stopdoc:
+      ret
+    end
 
-    # for URI::split
-    ABS_URI = Regexp.new('^' + PATTERN::X_ABS_URI + '$', #'
-                         Regexp::EXTENDED, 'N').freeze
-    REL_URI = Regexp.new('^' + PATTERN::X_REL_URI + '$', #'
-                         Regexp::EXTENDED, 'N').freeze
+    # Constructs the default Hash of Regexp's
+    def initialize_regexp(pattern)
+      ret = {}
 
-    # for URI::extract
-    URI_REF     = Regexp.new(PATTERN::URI_REF, false, 'N').freeze
-    ABS_URI_REF = Regexp.new(PATTERN::X_ABS_URI, Regexp::EXTENDED, 'N').freeze
-    REL_URI_REF = Regexp.new(PATTERN::X_REL_URI, Regexp::EXTENDED, 'N').freeze
+      # for URI::split
+      ret[:ABS_URI] = Regexp.new('\A\s*' + pattern[:X_ABS_URI] + '\s*\z', Regexp::EXTENDED)
+      ret[:REL_URI] = Regexp.new('\A\s*' + pattern[:X_REL_URI] + '\s*\z', Regexp::EXTENDED)
 
-    # for URI::escape/unescape
-    ESCAPED = Regexp.new(PATTERN::ESCAPED, false, 'N').freeze
-    UNSAFE  = Regexp.new("[^#{PATTERN::UNRESERVED}#{PATTERN::RESERVED}]",
-                         false, 'N').freeze
+      # for URI::extract
+      ret[:URI_REF]     = Regexp.new(pattern[:URI_REF])
+      ret[:ABS_URI_REF] = Regexp.new(pattern[:X_ABS_URI], Regexp::EXTENDED)
+      ret[:REL_URI_REF] = Regexp.new(pattern[:X_REL_URI], Regexp::EXTENDED)
 
-    # for Generic#initialize
-    SCHEME   = Regexp.new("^#{PATTERN::SCHEME}$", false, 'N').freeze #"
-    USERINFO = Regexp.new("^#{PATTERN::USERINFO}$", false, 'N').freeze #"
-    HOST     = Regexp.new("^#{PATTERN::HOST}$", false, 'N').freeze #"
-    PORT     = Regexp.new("^#{PATTERN::PORT}$", false, 'N').freeze #"
-    OPAQUE   = Regexp.new("^#{PATTERN::OPAQUE_PART}$", false, 'N').freeze #"
-    REGISTRY = Regexp.new("^#{PATTERN::REG_NAME}$", false, 'N').freeze #"
-    ABS_PATH = Regexp.new("^#{PATTERN::ABS_PATH}$", false, 'N').freeze #"
-    REL_PATH = Regexp.new("^#{PATTERN::REL_PATH}$", false, 'N').freeze #"
-    QUERY    = Regexp.new("^#{PATTERN::QUERY}$", false, 'N').freeze #"
-    FRAGMENT = Regexp.new("^#{PATTERN::FRAGMENT}$", false, 'N').freeze #"
-    # :startdoc:
-  end # REGEXP
+      # for URI::escape/unescape
+      ret[:ESCAPED] = Regexp.new(pattern[:ESCAPED])
+      ret[:UNSAFE]  = Regexp.new("[^#{pattern[:UNRESERVED]}#{pattern[:RESERVED]}]")
 
+      # for Generic#initialize
+      ret[:SCHEME]   = Regexp.new("\\A#{pattern[:SCHEME]}\\z")
+      ret[:USERINFO] = Regexp.new("\\A#{pattern[:USERINFO]}\\z")
+      ret[:HOST]     = Regexp.new("\\A#{pattern[:HOST]}\\z")
+      ret[:PORT]     = Regexp.new("\\A#{pattern[:PORT]}\\z")
+      ret[:OPAQUE]   = Regexp.new("\\A#{pattern[:OPAQUE_PART]}\\z")
+      ret[:REGISTRY] = Regexp.new("\\A#{pattern[:REG_NAME]}\\z")
+      ret[:ABS_PATH] = Regexp.new("\\A#{pattern[:ABS_PATH]}\\z")
+      ret[:REL_PATH] = Regexp.new("\\A#{pattern[:REL_PATH]}\\z")
+      ret[:QUERY]    = Regexp.new("\\A#{pattern[:QUERY]}\\z")
+      ret[:FRAGMENT] = Regexp.new("\\A#{pattern[:FRAGMENT]}\\z")
+
+      ret
+    end
+
+    def convert_to_uri(uri)
+      if uri.is_a?(URI::Generic)
+        uri
+      elsif uri = String.try_convert(uri)
+        parse(uri)
+      else
+        raise ArgumentError,
+          "bad argument (expected URI object or URI string)"
+      end
+    end
+
+  end # class Parser
+
+  # The default URI::Parser instance, used by URI.split, URI.parse, URI.join, URI.extract and URI.regexp
+  DEFAULT_PARSER = Parser.new
+  DEFAULT_PARSER.pattern.each_pair do |sym, str|
+    unless REGEXP::PATTERN.const_defined?(sym)
+      REGEXP::PATTERN.const_set(sym, str)
+    end
+  end
+  DEFAULT_PARSER.regexp.each_pair do |sym, str|
+    const_set(sym, str)
+  end
+
   module Util # :nodoc:
     def make_components_hash(klass, array_hash)
       tmp = {}
       if array_hash.kind_of?(Array) &&
           array_hash.size == klass.component.size - 1
@@ -234,23 +572,22 @@
           rescue TypeError
             tmp[key] = value
           end
         end
       else
-        raise ArgumentError, 
+        raise ArgumentError,
           "expected Array of or Hash of components of #{klass.to_s} (#{klass.component[1..-1].join(', ')})"
       end
       tmp[:scheme] = klass.to_s.sub(/\A.*::/, '').downcase
 
       return tmp
     end
     module_function :make_components_hash
   end
 
+  # module for escaping unsafe characters with codes.
   module Escape
-    include REGEXP
-
     #
     # == Synopsis
     #
     #   URI.escape(str [, unsafe])
     #
@@ -279,22 +616,13 @@
     #   # => "http://example.com/?a=\t\r"
     #
     #   p URI.escape("@?@!", "!?")
     #   # => "@%3F@%21"
     #
-    def escape(str, unsafe = UNSAFE)
-      unless unsafe.kind_of?(Regexp)
-        # perhaps unsafe is String object
-        unsafe = Regexp.new("[#{Regexp.quote(unsafe)}]", false, 'N')
-      end
-      str.gsub(unsafe) do |us|
-        tmp = ''
-        us.each_byte do |uc|
-          tmp << sprintf('%%%02X', uc)
-        end
-        tmp
-      end
+    def escape(*arg)
+      warn "#{caller(1)[0]}: warning: URI.escape is obsolete" if $VERBOSE
+      DEFAULT_PARSER.escape(*arg)
     end
     alias encode escape
     #
     # == Synopsis
     #
@@ -314,23 +642,26 @@
     #   # => "http://example.com/?a=%09%0D"
     #
     #   p URI.unescape(enc_uri)
     #   # => "http://example.com/?a=\t\r"
     #
-    def unescape(str)
-      str.gsub(ESCAPED) do
-        $&[1,2].hex.chr
-      end
+    def unescape(*arg)
+      warn "#{caller(1)[0]}: warning: URI.unescape is obsolete" if $VERBOSE
+      DEFAULT_PARSER.unescape(*arg)
     end
     alias decode unescape
-  end
+  end # module Escape
 
-  include REGEXP
   extend Escape
+  include REGEXP
 
   @@schemes = {}
-  
+  # Returns a Hash of the defined schemes
+  def self.scheme_list
+    @@schemes
+  end
+
   #
   # Base class for all URI exceptions.
   #
   class Error < StandardError; end
   #
@@ -367,88 +698,20 @@
   #   * Registry
   #   * Path
   #   * Opaque
   #   * Query
   #   * Fragment
-  # 
+  #
   # == Usage
   #
   #   require 'uri'
   #
   #   p URI.split("http://www.ruby-lang.org/")
   #   # => ["http", nil, "www.ruby-lang.org", nil, nil, "/", nil, nil, nil]
   #
   def self.split(uri)
-    case uri
-    when ''
-      # null uri
-
-    when ABS_URI
-      scheme, opaque, userinfo, host, port, 
-        registry, path, query, fragment = $~[1..-1]
-
-      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
-
-      # absoluteURI   = scheme ":" ( hier_part | opaque_part )
-      # hier_part     = ( net_path | abs_path ) [ "?" query ]
-      # opaque_part   = uric_no_slash *uric
-
-      # abs_path      = "/"  path_segments
-      # net_path      = "//" authority [ abs_path ]
-
-      # authority     = server | reg_name
-      # server        = [ [ userinfo "@" ] hostport ]
-
-      if !scheme
-        raise InvalidURIError, 
-          "bad URI(absolute but no scheme): #{uri}"
-      end
-      if !opaque && (!path && (!host && !registry))
-        raise InvalidURIError,
-          "bad URI(absolute but no path): #{uri}" 
-      end
-
-    when REL_URI
-      scheme = nil
-      opaque = nil
-
-      userinfo, host, port, registry, 
-        rel_segment, abs_path, query, fragment = $~[1..-1]
-      if rel_segment && abs_path
-        path = rel_segment + abs_path
-      elsif rel_segment
-        path = rel_segment
-      elsif abs_path
-        path = abs_path
-      end
-
-      # URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
-
-      # relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
-
-      # net_path      = "//" authority [ abs_path ]
-      # abs_path      = "/"  path_segments
-      # rel_path      = rel_segment [ abs_path ]
-
-      # authority     = server | reg_name
-      # server        = [ [ userinfo "@" ] hostport ]
-
-    else
-      raise InvalidURIError, "bad URI(is not URI?): #{uri}"
-    end
-
-    path = '' if !path && !opaque # (see RFC2396 Section 5.2)
-    ret = [
-      scheme, 
-      userinfo, host, port,         # X
-      registry,                        # X
-      path,                         # Y
-      opaque,                        # Y
-      query,
-      fragment
-    ]
-    return ret
+    DEFAULT_PARSER.split(uri)
   end
 
   #
   # == Synopsis
   #
@@ -460,11 +723,11 @@
   #   String with URI.
   #
   # == Description
   #
   # Creates one of the URI's subclasses instance from the string.
-  #  
+  #
   # == Raises
   #
   # URI::InvalidURIError
   #   Raised if URI given is not a correct one.
   #
@@ -473,28 +736,17 @@
   #   require 'uri'
   #
   #   uri = URI.parse("http://www.ruby-lang.org/")
   #   p uri
   #   # => #<URI::HTTP:0x202281be URL:http://www.ruby-lang.org/>
-  #   p uri.scheme 
-  #   # => "http" 
-  #   p uri.host 
-  #   # => "www.ruby-lang.org" 
-  # 
+  #   p uri.scheme
+  #   # => "http"
+  #   p uri.host
+  #   # => "www.ruby-lang.org"
+  #
   def self.parse(uri)
-    scheme, userinfo, host, port, 
-      registry, path, opaque, query, fragment = self.split(uri)
-
-    if scheme && @@schemes.include?(scheme.upcase)
-      @@schemes[scheme.upcase].new(scheme, userinfo, host, port, 
-                                   registry, path, opaque, query, 
-                                   fragment)
-    else
-      Generic.new(scheme, userinfo, host, port, 
-                  registry, path, opaque, query, 
-                  fragment)
-    end
+    DEFAULT_PARSER.parse(uri)
   end
 
   #
   # == Synopsis
   #
@@ -511,29 +763,38 @@
   #
   # == Usage
   #
   #   require 'uri'
   #
-  #   p URI.join("http://localhost/","main.rbx")
+  #   p URI.join("http://example.com/","main.rbx")
   #   # => #<URI::HTTP:0x2022ac02 URL:http://example.com/main.rbx>
   #
+  #   p URI.join('http://example.com', 'foo')
+  #   # => #<URI::HTTP:0x01ab80a0 URL:http://example.com/foo>
+  #
+  #   p URI.join('http://example.com', '/foo', '/bar')
+  #   # => #<URI::HTTP:0x01aaf0b0 URL:http://example.com/bar>
+  #
+  #   p URI.join('http://example.com', '/foo', 'bar')
+  #   # => #<URI::HTTP:0x801a92af0 URL:http://example.com/bar>
+  #
+  #   p URI.join('http://example.com', '/foo/', 'bar')
+  #   # => #<URI::HTTP:0x80135a3a0 URL:http://example.com/foo/bar>
+  #
+  #
   def self.join(*str)
-    u = self.parse(str[0])
-    str[1 .. -1].each do |x|
-      u = u.merge(x)
-    end
-    u
+    DEFAULT_PARSER.join(*str)
   end
 
   #
   # == Synopsis
   #
   #   URI::extract(str[, schemes][,&blk])
   #
   # == Args
   #
-  # +str+:: 
+  # +str+::
   #   String to extract URIs from.
   # +schemes+::
   #   Limit URI matching to a specific schemes.
   #
   # == Description
@@ -547,67 +808,194 @@
   #
   #   URI.extract("text here http://foo.example.org/bla and here mailto:test@example.com and here also.")
   #   # => ["http://foo.example.org/bla", "mailto:test@example.com"]
   #
   def self.extract(str, schemes = nil, &block)
-    if block_given?
-      str.scan(regexp(schemes)) { yield $& }
-      nil
-    else
-      result = []
-      str.scan(regexp(schemes)) { result.push $& }
-      result
-    end
+    DEFAULT_PARSER.extract(str, schemes, &block)
   end
 
   #
   # == Synopsis
   #
   #   URI::regexp([match_schemes])
   #
   # == Args
   #
-  # +match_schemes+:: 
+  # +match_schemes+::
   #   Array of schemes. If given, the resulting regexp matches URIs
   #   whose scheme is one of the match_schemes.
-  # 
+  #
   # == Description
   # Returns a Regexp object which matches URI-like strings.
   # The Regexp object returned by this method includes an arbitrary
   # number of capture groups (parentheses).  Never rely on their number.
-  # 
+  #
   # == Usage
   #
   #   require 'uri'
   #
   #   # extract first URI from html_string
   #   html_string.slice(URI.regexp)
-  # 
+  #
   #   # remove ftp URIs
   #   html_string.sub(URI.regexp(['ftp']), '')
-  # 
+  #
   #   # You should not rely on the number of parentheses
   #   html_string.scan(URI.regexp) do |*matches|
   #     p $&
   #   end
   #
   def self.regexp(schemes = nil)
-    unless schemes
-      ABS_URI_REF
+    DEFAULT_PARSER.make_regexp(schemes)
+  end
+
+  TBLENCWWWCOMP_ = {} # :nodoc:
+  256.times do |i|
+    TBLENCWWWCOMP_[i.chr] = '%%%02X' % i
+  end
+  TBLENCWWWCOMP_[' '] = '+'
+  TBLENCWWWCOMP_.freeze
+  TBLDECWWWCOMP_ = {} # :nodoc:
+  256.times do |i|
+    h, l = i>>4, i&15
+    TBLDECWWWCOMP_['%%%X%X' % [h, l]] = i.chr
+    TBLDECWWWCOMP_['%%%x%X' % [h, l]] = i.chr
+    TBLDECWWWCOMP_['%%%X%x' % [h, l]] = i.chr
+    TBLDECWWWCOMP_['%%%x%x' % [h, l]] = i.chr
+  end
+  TBLDECWWWCOMP_['+'] = ' '
+  TBLDECWWWCOMP_.freeze
+
+  HTML5ASCIIINCOMPAT = [Encoding::UTF_7, Encoding::UTF_16BE, Encoding::UTF_16LE,
+    Encoding::UTF_32BE, Encoding::UTF_32LE] # :nodoc:
+
+  # Encode given +str+ to URL-encoded form data.
+  #
+  # This method doesn't convert *, -, ., 0-9, A-Z, _, a-z, but does convert SP
+  # (ASCII space) to + and converts others to %XX.
+  #
+  # This is an implementation of
+  # http://www.w3.org/TR/html5/association-of-controls-and-forms.html#url-encoded-form-data
+  #
+  # See URI.decode_www_form_component, URI.encode_www_form
+  def self.encode_www_form_component(str)
+    str = str.to_s
+    if HTML5ASCIIINCOMPAT.include?(str.encoding)
+      str = str.encode(Encoding::UTF_8)
     else
-      /(?=#{Regexp.union(*schemes)}:)#{PATTERN::X_ABS_URI}/xn
+      str = str.dup
     end
+    str.force_encoding(Encoding::ASCII_8BIT)
+    str.gsub!(/[^*\-.0-9A-Z_a-z]/, TBLENCWWWCOMP_)
+    str.force_encoding(Encoding::US_ASCII)
   end
 
-end
+  # Decode given +str+ of URL-encoded form data.
+  #
+  # This decodes + to SP.
+  #
+  # See URI.encode_www_form_component, URI.decode_www_form
+  def self.decode_www_form_component(str, enc=Encoding::UTF_8)
+    raise ArgumentError, "invalid %-encoding (#{str})" unless /\A[^%]*(?:%\h\h[^%]*)*\z/ =~ str
+    str.gsub(/\+|%\h\h/, TBLDECWWWCOMP_).force_encoding(enc)
+  end
 
-module Kernel
-  # alias for URI.parse.
+  # Generate URL-encoded form data from given +enum+.
   #
-  # This method is introduced at 1.8.2.
-  def URI(uri_str) # :doc:
-    return uri_str if uri_str.is_a? URI
+  # This generates application/x-www-form-urlencoded data defined in HTML5
+  # from the given Enumerable object.
+  #
+  # This internally uses URI.encode_www_form_component(str).
+  #
+  # This method doesn't convert the encoding of given items, so convert them
+  # before calling this method if you want to send data in an encoding other
+  # than the original one or as mixed-encoding data. (Strings which are encoded
+  # in an HTML5 ASCII incompatible encoding are converted to UTF-8.)
+  #
+  # This method doesn't handle files.  When you send a file, use
+  # multipart/form-data.
+  #
+  # This is an implementation of
+  # http://www.w3.org/TR/html5/forms.html#url-encoded-form-data
+  #
+  #    URI.encode_www_form([["q", "ruby"], ["lang", "en"]])
+  #    #=> "q=ruby&lang=en"
+  #    URI.encode_www_form("q" => "ruby", "lang" => "en")
+  #    #=> "q=ruby&lang=en"
+  #    URI.encode_www_form("q" => ["ruby", "perl"], "lang" => "en")
+  #    #=> "q=ruby&q=perl&lang=en"
+  #    URI.encode_www_form([["q", "ruby"], ["q", "perl"], ["lang", "en"]])
+  #    #=> "q=ruby&q=perl&lang=en"
+  #
+  # See URI.encode_www_form_component, URI.decode_www_form
+  def self.encode_www_form(enum)
+    enum.map do |k,v|
+      if v.nil?
+        encode_www_form_component(k)
+      elsif v.respond_to?(:to_ary)
+        v.to_ary.map do |w|
+          str = encode_www_form_component(k)
+          unless w.nil?
+            str << '='
+            str << encode_www_form_component(w)
+          end
+        end.join('&')
+      else
+        str = encode_www_form_component(k)
+        str << '='
+        str << encode_www_form_component(v)
+      end
+    end.join('&')
+  end
 
-    URI.parse(uri_str)
+  WFKV_ = '(?:[^%#=;&]*(?:%\h\h[^%#=;&]*)*)' # :nodoc:
+
+  # Decode URL-encoded form data from given +str+.
+  #
+  # This decodes application/x-www-form-urlencoded data
+  # and returns an array of key-value arrays.
+  # This internally uses URI.decode_www_form_component.
+  #
+  # _charset_ hack is not supported now because the mapping from given charset
+  # to Ruby's encoding is not clear yet.
+  # see also http://www.w3.org/TR/html5/syntax.html#character-encodings-0
+  #
+  # This refers to http://www.w3.org/TR/html5/forms.html#url-encoded-form-data
+  #
+  # ary = URI.decode_www_form("a=1&a=2&b=3")
+  # p ary                  #=> [['a', '1'], ['a', '2'], ['b', '3']]
+  # p ary.assoc('a').last  #=> '1'
+  # p ary.assoc('b').last  #=> '3'
+  # p ary.reverse.assoc('a').last #=> '2'
+  # p Hash[ary]            # => {"a"=>"2", "b"=>"3"}
+  #
+  # See URI.decode_www_form_component, URI.encode_www_form
+  def self.decode_www_form(str, enc=Encoding::UTF_8)
+    return [] if str.empty?
+    unless /\A#{WFKV_}=#{WFKV_}(?:[;&]#{WFKV_}=#{WFKV_})*\z/o =~ str
+      raise ArgumentError, "invalid data of application/x-www-form-urlencoded (#{str})"
+    end
+    ary = []
+    $&.scan(/([^=;&]+)=([^;&]*)/) do
+      ary << [decode_www_form_component($1, enc), decode_www_form_component($2, enc)]
+    end
+    ary
+  end
+end # module URI
+
+module Kernel
+
+  #
+  # Returns +uri+ converted to a URI object.
+  #
+  def URI(uri)
+    if uri.is_a?(URI::Generic)
+      uri
+    elsif uri = String.try_convert(uri)
+      URI.parse(uri)
+    else
+      raise ArgumentError,
+        "bad argument (expected URI object or URI string)"
+    end
   end
   module_function :URI
 end