lib/postrank-uri.rb in postrank-uri-1.0.18 vs lib/postrank-uri.rb in postrank-uri-1.0.20

- old
+ new

@@ -1,17 +1,17 @@ - +# encoding: utf-8 require 'addressable/uri' require 'digest/md5' require 'nokogiri' require 'public_suffix' require 'yaml' module Addressable class URI def domain host = self.host - (host && PublicSuffix.valid?(host)) ? PublicSuffix.parse(host).domain : nil + (host && PublicSuffix.valid?(host, default_rule: nil)) ? PublicSuffix.parse(host).domain : nil end def normalized_query @normalized_query ||= (begin if self.query && self.query.strip != '' @@ -84,22 +84,23 @@ (\?#{URIREGEX[:valid_url_query_chars]}*#{URIREGEX[:valid_url_query_ending_chars]})? ) ) }iox; + URIREGEX[:reserved_characters] = /%3F|%26/i URIREGEX[:escape] = /([^ a-zA-Z0-9_.-]+)/x - URIREGEX[:unescape] = /((?:%[0-9a-fA-F]{2})+)/x + URIREGEX[:unescape] = /(%[0-9a-fA-F]{2})/x URIREGEX.each_pair{|k,v| v.freeze } module_function def extract(text) return [] if !text urls = [] text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query| # Only extract the URL if the domain is valid - if PublicSuffix.valid?(domain) + if PublicSuffix.valid?(domain, default_rule: nil) url = clean(url) urls.push url.to_s end end @@ -129,12 +130,16 @@ end def unescape(uri) u = parse(uri) u.query = u.query.tr('+', ' ') if u.query - u.to_s.gsub(URIREGEX[:unescape]) do - [$1.delete('%')].pack('H*') + u.to_s.gsub(URIREGEX[:unescape]) do |encoded| + if encoded.match? URIREGEX[:reserved_characters] + encoded + else + [encoded.delete('%')].pack('H*') + end end end def clean(uri, opts = {}) uri = normalize(c14n(unescape(uri), opts)) @@ -223,10 +228,10 @@ is_valid = false cleaned_uri = clean(uri, :raw => true) if host = cleaned_uri.host - is_valid = PublicSuffix.valid?(Addressable::IDNA.to_unicode(host)) + is_valid = PublicSuffix.valid?(Addressable::IDNA.to_unicode(host), default_rule: nil) end is_valid end end