app/models/chunks/uri.rb in instiki-0.9.2 vs app/models/chunks/uri.rb in instiki-0.10.0

- old
+ new

require 'chunks/chunk'

# This wiki chunk matches arbitrary URIs, using patterns from the Ruby URI modules.
# It parses out a variety of fields that could be used by renderers to format
# the links in various ways (shortening domain names, hiding email addresses).
# It matches email addresses and host.com.au domains without schemes (http://)
# but adds these on as required.
#
# The heuristic used to match a URI is designed to err on the side of caution.
# That is, it is more likely to not autolink a URI than it is to accidentally
# autolink something that is not a URI. The reason behind this is that it is
# easier to force a URI link by prefixing 'http://' to it than it is to escape
# an incorrectly marked up non-URI.
#
# I'm using a part of the [ISO 3166-1 Standard][iso3166] for country name suffixes.
# The generic names are from www.bnoack.com/data/countrycode2.html
# [iso3166]: http://geotags.com/iso3166/

class URIChunk < Chunk::Abstract
  include URI::REGEXP::PATTERN

  # this condition is to get rid of pesky "already initialized constant"
  # warnings when the file is loaded more than once (e.g. in tests)
  unless defined? URIChunk::INTERNET_URI_REGEXP

    GENERIC = 'aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org'

    COUNTRY = 'ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|az|ba|bb|bd|be|' +
      'bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cf|cd|cg|ch|ci|ck|cl|' +
      'cm|cn|co|cr|cs|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|fi|' +
      'fj|fk|fm|fo|fr|fx|ga|gb|gd|ge|gf|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|' +
      'hk|hm|hn|hr|ht|hu|id|ie|il|in|io|iq|ir|is|it|jm|jo|jp|ke|kg|kh|ki|km|kn|' +
      'kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|mg|mh|mk|ml|mm|' +
      'mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nt|' +
      'nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pt|pw|py|qa|re|ro|ru|rw|sa|sb|sc|' +
      'sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|' +
      'tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|' +
      'ws|ye|yt|yu|za|zm|zr|zw'

    # These are needed otherwise HOST will match almost anything
    TLDS = "(?:#{GENERIC}|#{COUNTRY})"

    # Redefine USERINFO so that it must have non-zero length
    USERINFO = "(?:[#{UNRESERVED};:&=+$,]|#{ESCAPED})+"

    # unreserved_no_ending = alphanum | mark, but URI_ENDING [)!] excluded
    UNRESERVED_NO_ENDING = "-_.~*'(#{ALNUM}"

    # The three patterns below ensure that a query or fragment never ends
    # with ')' or '!', which lets self.pattern use a much simpler Regexp.

    # uric_no_ending = reserved | unreserved_no_ending | escaped
    URIC_NO_ENDING = "(?:[#{UNRESERVED_NO_ENDING}#{RESERVED}]|#{ESCAPED})"
    # query = *uric
    QUERY = "#{URIC_NO_ENDING}*"
    # fragment = *uric
    FRAGMENT = "#{URIC_NO_ENDING}*"

    # DOMLABEL is defined in the ruby uri library, TLDS is defined above
    INTERNET_HOSTNAME = "(?:#{DOMLABEL}\\.)+#{TLDS}"

    # Correct a typo bug in ruby 1.8.x lib/uri/common.rb
    PORT = '\\d*'

    # Group numbers below are relative to this string. In the compiled
    # regexp SUSPICIOUS_PRECEDING_CHARACTER contributes group 1, so the
    # actual MatchData indices are one higher (2..8).
    INTERNET_URI =
        "(?:(#{SCHEME}):/{0,2})?" +   # Optional scheme:        (1st group)
        "(?:(#{USERINFO})@)?" +       # Optional userinfo@      (2nd group)
        "(#{INTERNET_HOSTNAME})" +    # Mandatory hostname      (3rd group)
        "(?::(#{PORT}))?" +           # Optional :port          (4th group)
        "(#{ABS_PATH})?" +            # Optional absolute path  (5th group)
        "(?:\\?(#{QUERY}))?" +        # Optional ?query         (6th group)
        "(?:\\#(#{FRAGMENT}))?" +     # Optional #fragment      (7th group)
        '(?=\.?(?:\s|\)|\z))'         # ends only with optional dot + space or ")"
                                      # or end of the string

    SUSPICIOUS_PRECEDING_CHARACTER = '(!|\"\:|\"|\\\')?'  # any of !, ":, ", '

    # NOTE(review): the third argument 'N' is the legacy (Ruby 1.8) "no
    # encoding" language flag; newer Rubies reject a third argument to
    # Regexp.new - confirm the target Ruby before changing it.
    INTERNET_URI_REGEXP =
        Regexp.new(SUSPICIOUS_PRECEDING_CHARACTER + INTERNET_URI, Regexp::EXTENDED, 'N')

  end

  # Regexp used by apply_to to find candidate URIs in the content.
  def self.pattern
    INTERNET_URI_REGEXP
  end

  attr_reader :user, :host, :port, :path, :query, :fragment, :link_text

  # Scans +content+ (in place, via gsub!) for URIs. Every match that should be
  # autolinked is registered with content.add_chunk and replaced by the chunk's
  # mask; matches preceded by a suspicious character are left untouched.
  def self.apply_to(content)
    content.gsub!(pattern) do |matched_text|
      chunk = new($~, content)
      if chunk.avoid_autolinking?
        # do not substitute nor register the chunk
        matched_text
      else
        content.add_chunk(chunk)
        chunk.mask
      end
    end
  end

  # Builds the chunk from a match of self.pattern.
  #
  # match_data[0] is the whole matched text, match_data[1] the optional
  # suspicious preceding character, and match_data[2..8] the URI components
  # (scheme, userinfo, host, port, path, query, fragment).
  def initialize(match_data, content)
    super
    @link_text = match_data[0]
    @suspicious_preceding_character = match_data[1]
    @original_scheme, @user, @host, @port, @path, @query, @fragment = match_data[2..-1]
    treat_trailing_character
    # The whole match (including any trailing ')' or '!') is replaced by the
    # mask in apply_to, so the punctuation stripped from the link must be put
    # back after the anchor; otherwise it disappears from the rendered text.
    # (@unmask_text is presumably what the superclass substitutes for the
    # mask - that code is not in this file.)
    @unmask_text = "<a href=\"#{uri}\">#{link_text}</a>#{@trailing_punctuation}"
  end

  # True when the match was preceded by one of !, ":, " or ' - a sign that the
  # URI is already part of some other markup and must not be linked again.
  def avoid_autolinking?
    !@suspicious_preceding_character.nil?
  end

  # If the last character matched by the URI pattern is ! or ), it may be part
  # of the surrounding markup, not of the URL. It is possible to handle this
  # with a regexp, but much easier to do programmatically: the character is
  # removed from the link text and from the last URI component that matched,
  # and remembered in @trailing_punctuation.
  def treat_trailing_character
    last_char = @link_text[-1..-1]
    if last_char == ')' || last_char == '!'
      @trailing_punctuation = last_char
      @link_text.chop!
      # The last non-nil component is the one that swallowed the character.
      [@original_scheme, @user, @host, @port, @path, @query, @fragment].compact.last.chop!
    else
      @trailing_punctuation = nil
    end
  end

  # Scheme given in the text, or a default: 'mailto' when a userinfo part was
  # matched (the text looks like an email address), 'http' otherwise.
  def scheme
    @original_scheme || (@user ? 'mailto' : 'http')
  end

  # Separator that follows the scheme in the generated URI.
  def scheme_delimiter
    scheme == 'mailto' ? ':' : '://'
  end

  # '@' when a userinfo part is present, nil otherwise.
  def user_delimiter
    '@' unless @user.nil?
  end

  # ':' when a port was matched, nil otherwise.
  def port_delimiter
    ':' unless @port.nil?
  end

  # '?' when a query was matched, nil otherwise.
  def query_delimiter
    '?' unless @query.nil?
  end

  # The href for the link, reassembled from the parsed components (absent
  # components and their delimiters are skipped). Note that the fragment is
  # not included in the result.
  def uri
    [scheme, scheme_delimiter, user, user_delimiter, host, port_delimiter, port, path,
      query_delimiter, query].compact.join
  end

end

# uri with mandatory scheme but less restrictive hostname, like
# http://localhost:2500/blah.html
class LocalURIChunk < URIChunk

  # same guard as in URIChunk: avoid redefining the constants on reload
  unless defined? LocalURIChunk::LOCAL_URI_REGEXP
    # hostname can be just a simple word like 'localhost'
    ANY_HOSTNAME = "(?:#{DOMLABEL}\\.)*#{TOPLABEL}\\.?"

    # The basic URI expression as a string
    # Scheme and hostname are mandatory.
    # Group numbers are relative to this string; in LOCAL_URI_REGEXP they are
    # shifted by one because SUSPICIOUS_PRECEDING_CHARACTER is group 1.
    LOCAL_URI =
        "(?:(#{SCHEME})://)+" +       # Mandatory scheme://     (1st group)
        "(?:(#{USERINFO})@)?" +       # Optional userinfo@      (2nd group)
        "(#{ANY_HOSTNAME})" +         # Mandatory hostname      (3rd group)
        "(?::(#{PORT}))?" +           # Optional :port          (4th group)
        "(#{ABS_PATH})?" +            # Optional absolute path  (5th group)
        "(?:\\?(#{QUERY}))?" +        # Optional ?query         (6th group)
        "(?:\\#(#{FRAGMENT}))?" +     # Optional #fragment      (7th group)
        '(?=\.?(?:\s|\)|\z))'         # ends only with optional dot + space or ")"
                                      # or end of the string

    LOCAL_URI_REGEXP = Regexp.new(SUSPICIOUS_PRECEDING_CHARACTER + LOCAL_URI, Regexp::EXTENDED, 'N')
  end

  # Regexp used by the inherited apply_to for scheme-qualified local URIs.
  def self.pattern
    LOCAL_URI_REGEXP
  end

end