app/models/chunks/uri.rb in instiki-0.9.2 vs app/models/chunks/uri.rb in instiki-0.10.0
- old
+ new
@@ -1,97 +1,182 @@
-require 'chunks/chunk'
-
-# This wiki chunk matches arbitrary URIs, using patterns from the Ruby URI modules.
-# It parses out a variety of fields that could be used by renderers to format
-# the links in various ways (shortening domain names, hiding email addresses)
-# It matches email addresses and host.com.au domains without schemes (http://)
-# but adds these on as required.
-#
-# The heuristic used to match a URI is designed to err on the side of caution.
-# That is, it is more likely to not autolink a URI than it is to accidently
-# autolink something that is not a URI. The reason behind this is it is easier
-# to force a URI link by prefixing 'http://' to it than it is to escape and
-# incorrectly marked up non-URI.
-#
-# I'm using a part of the [ISO 3166-1 Standard][iso3166] for country name suffixes.
-# The generic names are from www.bnoack.com/data/countrycode2.html)
-# [iso3166]: http://geotags.com/iso3166/
-class URIChunk < Chunk::Abstract
- include URI::REGEXP::PATTERN
-
- GENERIC = '(?:aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org)'
- COUNTRY = '(?:au|at|be|ca|ch|de|dk|fr|hk|in|ir|it|jp|nl|no|pt|ru|se|sw|tv|tw|uk|us)'
-
- # These are needed otherwise HOST will match almost anything
- TLDS = "\\.(?:#{GENERIC}|#{COUNTRY})"
-
- # Redefine USERINFO so that it must have non-zero length
- USERINFO = "(?:[#{UNRESERVED};:&=+$,]|#{ESCAPED})+"
-
- # Pattern of legal URI endings to stop interference with some Textile
- # markup. (Images: !URI!) and other punctuation eg, (http://wiki.com/)
- URI_ENDING = '[)!]'
-
- # The basic URI expression as a string
- URI_PATTERN =
- "(?:(#{SCHEME})://)?" + # Optional scheme:// (\1|\8)
- "(?:(#{USERINFO})@)?" + # Optional userinfo@ (\2|\9)
- "(#{HOSTNAME}#{TLDS})" + # Mandatory host eg, HOST.com.au (\3|\10)
- "(?::(#{PORT}))?" + # Optional :port (\4|\11)
- "(#{ABS_PATH})?" + # Optional absolute path (\5|\12)
- "(?:\\?(#{QUERY}))?" + # Optional ?query (\6|\13)
- "(?:\\#(#{FRAGMENT}))?" # Optional #fragment (\7|\14)
-
- def self.pattern()
- # This pattern first tries to match the URI_PATTERN that ends with
- # punctuation that is a valid URI character (eg, ')', '!'). If
- # such a match occurs, there should be no backtracking (hence the ?> ).
- # If the string cannot match a URI ending with URI_ENDING, then a second
- # attempt is tried.
- Regexp.new("(?>#{URI_PATTERN}(?=#{URI_ENDING}))|#{URI_PATTERN}", Regexp::EXTENDED, 'N')
- end
-
- attr_reader :uri, :scheme, :user, :host, :port, :path, :query, :fragment, :link_text
-
- def initialize(match_data)
- super(match_data)
- # Since the URI_PATTERN is tried twice, there are two sets of
- # groups, one from \1 to \7 and the second from \8 to \14.
- # The fields are set by which ever group matches.
- @scheme = match_data[1] || match_data[8]
- @user = match_data[2] || match_data[9]
- @host = match_data[3] || match_data[10]
- @port = match_data[4] || match_data[11]
- @path = match_data[5] || match_data[12]
- @query = match_data[6] || match_data[13]
- @fragment = match_data[7] || match_data[14]
-
- # If there is no scheme, add an appropriate one, otherwise
- # set the URI to the matched text.
- @text_scheme = scheme
- @uri = (scheme ? match_data[0] : nil )
- @scheme = scheme || ( user ? 'mailto' : 'http' )
- @delimiter = ( scheme == 'mailto' ? ':' : '://' )
- @uri ||= scheme + @delimiter + match_data[0]
-
- # Build up the link text. Schemes are omitted unless explicitly given.
- @link_text = ''
- @link_text << "#{@scheme}#{@delimiter}" if @text_scheme
- @link_text << "#{@user}@" if @user
- @link_text << "#{@host}" if @host
- @link_text << ":#{@port}" if @port
- @link_text << "#{@path}" if @path
- @link_text << "?#{@query}" if @query
- end
-
- # If the text should be escaped then don't keep this chunk.
- # Otherwise only keep this chunk if it was substituted back into the
- # content.
- def unmask(content)
- return nil if escaped_text
- return self if content.sub!( Regexp.new(mask(content)), "<a href=\"#{uri}\">#{link_text}</a>" )
- end
-
- # If there is no hostname in the URI, do not render it
- # It's probably only contains the scheme, eg 'something:'
- def escaped_text() ( host.nil? ? @uri : nil ) end
-end
+require 'chunks/chunk'
+
+# This wiki chunk matches arbitrary URIs, using patterns from the Ruby URI modules.
+# It parses out a variety of fields that could be used by renderers to format
+# the links in various ways (shortening domain names, hiding email addresses)
+# It matches email addresses and host.com.au domains without schemes (http://)
+# but adds these on as required.
+#
+# The heuristic used to match a URI is designed to err on the side of caution.
+# That is, it is more likely to not autolink a URI than it is to accidently
+# autolink something that is not a URI. The reason behind this is it is easier
+# to force a URI link by prefixing 'http://' to it than it is to escape and
+# incorrectly marked up non-URI.
+#
+# I'm using a part of the [ISO 3166-1 Standard][iso3166] for country name suffixes.
+# The generic names are from www.bnoack.com/data/countrycode2.html)
+# [iso3166]: http://geotags.com/iso3166/
+
+class URIChunk < Chunk::Abstract
+ include URI::REGEXP::PATTERN
+
+ # this condition is to get rid of pesky warnings in tests
+ unless defined? URIChunk::INTERNET_URI_REGEXP
+
+ GENERIC = 'aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org'
+
+ COUNTRY = 'ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|az|ba|bb|bd|be|' +
+ 'bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cf|cd|cg|ch|ci|ck|cl|' +
+ 'cm|cn|co|cr|cs|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|fi|' +
+ 'fj|fk|fm|fo|fr|fx|ga|gb|gd|ge|gf|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|' +
+ 'hk|hm|hn|hr|ht|hu|id|ie|il|in|io|iq|ir|is|it|jm|jo|jp|ke|kg|kh|ki|km|kn|' +
+ 'kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|mg|mh|mk|ml|mm|' +
+ 'mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nt|' +
+ 'nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pt|pw|py|qa|re|ro|ru|rw|sa|sb|sc|' +
+ 'sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|' +
+ 'tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|' +
+ 'ws|ye|yt|yu|za|zm|zr|zw'
+ # These are needed otherwise HOST will match almost anything
+ TLDS = "(?:#{GENERIC}|#{COUNTRY})"
+
+ # Redefine USERINFO so that it must have non-zero length
+ USERINFO = "(?:[#{UNRESERVED};:&=+$,]|#{ESCAPED})+"
+
+ # unreserved_no_ending = alphanum | mark, but URI_ENDING [)!] excluded
+ UNRESERVED_NO_ENDING = "-_.~*'(#{ALNUM}"
+
+ # this ensures that query or fragment do not end with URI_ENDING
+ # and enable us to use a much simpler self.pattern Regexp
+
+ # uric_no_ending = reserved | unreserved_no_ending | escaped
+ URIC_NO_ENDING = "(?:[#{UNRESERVED_NO_ENDING}#{RESERVED}]|#{ESCAPED})"
+ # query = *uric
+ QUERY = "#{URIC_NO_ENDING}*"
+ # fragment = *uric
+ FRAGMENT = "#{URIC_NO_ENDING}*"
+
+ # DOMLABEL is defined in the ruby uri library, TLDS is defined above
+ INTERNET_HOSTNAME = "(?:#{DOMLABEL}\\.)+#{TLDS}"
+
+ # Correct a typo bug in ruby 1.8.x lib/uri/common.rb
+ PORT = '\\d*'
+
+ INTERNET_URI =
+ "(?:(#{SCHEME}):/{0,2})?" + # Optional scheme: (\1)
+ "(?:(#{USERINFO})@)?" + # Optional userinfo@ (\2)
+ "(#{INTERNET_HOSTNAME})" + # Mandatory hostname (\3)
+ "(?::(#{PORT}))?" + # Optional :port (\4)
+ "(#{ABS_PATH})?" + # Optional absolute path (\5)
+ "(?:\\?(#{QUERY}))?" + # Optional ?query (\6)
+ "(?:\\#(#{FRAGMENT}))?" + # Optional #fragment (\7)
+ '(?=\.?(?:\s|\)|\z))' # ends only with optional dot + space or ")"
+ # or end of the string
+
+ SUSPICIOUS_PRECEDING_CHARACTER = '(!|\"\:|\"|\\\')?' # any of !, ":, ", '
+
+ INTERNET_URI_REGEXP =
+ Regexp.new(SUSPICIOUS_PRECEDING_CHARACTER + INTERNET_URI, Regexp::EXTENDED, 'N')
+
+ end
+
+ def URIChunk.pattern
+ INTERNET_URI_REGEXP
+ end
+
+ attr_reader :user, :host, :port, :path, :query, :fragment, :link_text
+
+ def self.apply_to(content)
+ content.gsub!( self.pattern ) do |matched_text|
+ chunk = self.new($~, content)
+ if chunk.avoid_autolinking?
+ # do not substitute nor register the chunk
+ matched_text
+ else
+ content.add_chunk(chunk)
+ chunk.mask
+ end
+ end
+ end
+
+ def initialize(match_data, content)
+ super
+ @link_text = match_data[0]
+ @suspicious_preceding_character = match_data[1]
+ @original_scheme, @user, @host, @port, @path, @query, @fragment = match_data[2..-1]
+ treat_trailing_character
+ @unmask_text = "<a href=\"#{uri}\">#{link_text}</a>"
+ end
+
+ def avoid_autolinking?
+ not @suspicious_preceding_character.nil?
+ end
+
+ def treat_trailing_character
+ # If the last character matched by URI pattern is in ! or ), this may be part of the markup,
+ # not a URL. We should handle it as such. It is possible to do it by a regexp, but
+ # much easier to do programmatically
+ last_char = @link_text[-1..-1]
+ if last_char == ')' or last_char == '!'
+ @trailing_punctuation = last_char
+ @link_text.chop!
+ [@original_scheme, @user, @host, @port, @path, @query, @fragment].compact.last.chop!
+ else
+ @trailing_punctuation = nil
+ end
+ end
+
+ def scheme
+ @original_scheme or (@user ? 'mailto' : 'http')
+ end
+
+ def scheme_delimiter
+ scheme == 'mailto' ? ':' : '://'
+ end
+
+ def user_delimiter
+ '@' unless @user.nil?
+ end
+
+ def port_delimiter
+ ':' unless @port.nil?
+ end
+
+ def query_delimiter
+ '?' unless @query.nil?
+ end
+
+ def uri
+ [scheme, scheme_delimiter, user, user_delimiter, host, port_delimiter, port, path,
+ query_delimiter, query].compact.join
+ end
+
+end
+
+# uri with mandatory scheme but less restrictive hostname, like
+# http://localhost:2500/blah.html
+class LocalURIChunk < URIChunk
+
+ unless defined? LocalURIChunk::LOCAL_URI_REGEXP
+ # hostname can be just a simple word like 'localhost'
+ ANY_HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"
+
+ # The basic URI expression as a string
+ # Scheme and hostname are mandatory
+ LOCAL_URI =
+ "(?:(#{SCHEME})://)+" + # Mandatory scheme:// (\1)
+ "(?:(#{USERINFO})@)?" + # Optional userinfo@ (\2)
+ "(#{ANY_HOSTNAME})" + # Mandatory hostname (\3)
+ "(?::(#{PORT}))?" + # Optional :port (\4)
+ "(#{ABS_PATH})?" + # Optional absolute path (\5)
+ "(?:\\?(#{QUERY}))?" + # Optional ?query (\6)
+ "(?:\\#(#{FRAGMENT}))?" + # Optional #fragment (\7)
+ '(?=\.?(?:\s|\)|\z))' # ends only with optional dot + space or ")"
+ # or end of the string
+
+ LOCAL_URI_REGEXP = Regexp.new(SUSPICIOUS_PRECEDING_CHARACTER + LOCAL_URI, Regexp::EXTENDED, 'N')
+ end
+
+ def LocalURIChunk.pattern
+ LOCAL_URI_REGEXP
+ end
+
+end