lib/postrank-uri.rb in postrank-uri-1.0.16 vs lib/postrank-uri.rb in postrank-uri-1.0.17

- old
+ new

@@ -1,24 +1,18 @@
 # -*- encoding: utf-8 -*-
 
 require 'addressable/uri'
-require 'domainatrix'
 require 'digest/md5'
 require 'nokogiri'
+require 'public_suffix'
 require 'yaml'
 
 module Addressable
   class URI
     def domain
-      begin
-        dp = Domainatrix.parse(self)
-      rescue
-        return nil
-      end
-
-      dom = dp.public_suffix
-      dom = dp.domain.downcase + "." + dom unless dp.domain.empty?
+      host = self.host
+      (host && PublicSuffix.valid?(host)) ? PublicSuffix.parse(host).domain : nil
     end
 
     def normalized_query
       @normalized_query ||= (begin
         if self.query && self.query.strip != ''
@@ -101,15 +95,14 @@
     def extract(text)
       return [] if !text
       urls = []
       text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query|
-        begin
+        # Only extract the URL if the domain is valid
+        if PublicSuffix.valid?(domain)
           url = clean(url)
-          Domainatrix.parse(url)
           urls.push url.to_s
-        rescue NoMethodError
         end
       end
 
       urls.compact
     end
 
@@ -221,12 +214,20 @@
       uri.scheme = 'http' if uri.host && !uri.scheme
       uri.normalize!
     end
 
     def valid?(uri)
-      Domainatrix.parse(uri)
-      true
-    rescue
-      false
+      # URI is only valid if it is not nil, parses cleanly as a URI,
+      # and the domain has a recognized, valid TLD component
+      return false if uri.nil?
+
+      is_valid = false
+      cleaned_uri = clean(uri, :raw => true)
+
+      if host = cleaned_uri.host
+        is_valid = PublicSuffix.valid?(host)
+      end
+
+      is_valid
     end
   end
 end