lib/postrank-uri.rb in postrank-uri-1.0.18 vs lib/postrank-uri.rb in postrank-uri-1.0.20
- old
+ new
@@ -1,17 +1,17 @@
-
+# encoding: utf-8
require 'addressable/uri'
require 'digest/md5'
require 'nokogiri'
require 'public_suffix'
require 'yaml'
module Addressable
class URI
# Returns PublicSuffix.parse(host).domain for this URI's host, or nil when the
# host is nil or PublicSuffix.valid? rejects it.
def domain
host = self.host
# Version change: the new (+) side passes `default_rule: nil` to the validity
# check; PublicSuffix.parse is still called without that option.
# NOTE(review): presumably `default_rule: nil` turns off public_suffix's
# fallback rule so hosts with unlisted TLDs are treated as invalid — confirm
# against the public_suffix documentation.
- (host && PublicSuffix.valid?(host)) ? PublicSuffix.parse(host).domain : nil
+ (host && PublicSuffix.valid?(host, default_rule: nil)) ? PublicSuffix.parse(host).domain : nil
end
def normalized_query
@normalized_query ||= (begin
if self.query && self.query.strip != ''
@@ -84,22 +84,23 @@
(\?#{URIREGEX[:valid_url_query_chars]}*#{URIREGEX[:valid_url_query_ending_chars]})?
)
)
}iox;
# Added on the new (+) side: matches the percent-escapes %3F and %26
# (encoded "?" and "&"), case-insensitively. Used by unescape to decide which
# escapes stay encoded.
+ URIREGEX[:reserved_characters] = /%3F|%26/i
# Captures one or more consecutive characters that are not in the bracketed
# set (ASCII letters, digits, "_", ".", "-"; the set also lists a space).
URIREGEX[:escape] = /([^ a-zA-Z0-9_.-]+)/x
# Version change: the old (-) pattern captured a whole run of consecutive %XX
# escapes as one match; the new (+) pattern captures exactly one %XX escape
# per match, so each escape can be examined individually.
- URIREGEX[:unescape] = /((?:%[0-9a-fA-F]{2})+)/x
+ URIREGEX[:unescape] = /(%[0-9a-fA-F]{2})/x
# Freeze every value stored in URIREGEX.
URIREGEX.each_pair{|k,v| v.freeze }
module_function
def extract(text)
return [] if !text
urls = []
text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query|
# Only extract the URL if the domain is valid
- if PublicSuffix.valid?(domain)
+ if PublicSuffix.valid?(domain, default_rule: nil)
url = clean(url)
urls.push url.to_s
end
end
@@ -129,12 +130,16 @@
end
# Percent-decodes +uri+ and returns the resulting String.
# The input is first passed through parse (defined outside this hunk); "+" in
# the query component is translated to a space before decoding.
def unescape(uri)
u = parse(uri)
# Only the query component gets the "+" -> " " translation, and only when a
# query is present.
u.query = u.query.tr('+', ' ') if u.query
# Old (-) side: every match of URIREGEX[:unescape] was hex-decoded
# unconditionally ($1 with the "%" signs removed, packed as hex).
- u.to_s.gsub(URIREGEX[:unescape]) do
- [$1.delete('%')].pack('H*')
# New (+) side: each matched escape is inspected. One that matches
# URIREGEX[:reserved_characters] (%3F or %26) is returned unchanged, i.e. it
# stays percent-encoded; any other escape is hex-decoded as before.
# NOTE(review): presumably this keeps an encoded "?" or "&" from being turned
# into a live query delimiter — confirm against the gem's changelog/specs.
# NOTE(review): String#match? is only available on Ruby >= 2.4.
+ u.to_s.gsub(URIREGEX[:unescape]) do |encoded|
+ if encoded.match? URIREGEX[:reserved_characters]
+ encoded
+ else
+ [encoded.delete('%')].pack('H*')
+ end
end
end
def clean(uri, opts = {})
uri = normalize(c14n(unescape(uri), opts))
@@ -223,10 +228,10 @@
is_valid = false
cleaned_uri = clean(uri, :raw => true)
if host = cleaned_uri.host
- is_valid = PublicSuffix.valid?(Addressable::IDNA.to_unicode(host))
+ is_valid = PublicSuffix.valid?(Addressable::IDNA.to_unicode(host), default_rule: nil)
end
is_valid
end
end