lib/postrank-uri.rb in postrank-uri-1.0.0 vs lib/postrank-uri.rb in postrank-uri-1.0.1

- old
+ new

@@ -1,9 +1,10 @@
 # -*- encoding: utf-8 -*-

 require 'addressable/uri'
 require 'domainatrix'
+require 'nokogiri'
 require 'yaml'

 module PostRank
   module URI

@@ -62,11 +63,13 @@

     URIREGEX[:escape] = /([^ a-zA-Z0-9_.-]+)/x
     URIREGEX[:unescape] = /((?:%[0-9a-fA-F]{2})+)/x
     URIREGEX.each_pair{|k,v| v.freeze }

-    def self.extract(text)
+    module_function
+
+    def extract(text)
       return [] if !text
       urls = []
       text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query|
         begin
           url = clean(url).to_s
@@ -77,35 +80,53 @@
       end

       urls.compact
     end

-    def self.escape(uri)
+    def extract_href(text, host = nil)
+      urls = {}
+      Nokogiri.HTML(text).search('a').each do |a|
+        begin
+          url = normalize(c18n(unescape(a.attr('href'))))
+          if url.host.empty?
+            next if host.nil?
+            url.host = host
+          end
+
+          urls[url.to_s] = a.text
+        rescue
+          next
+        end
+      end
+      urls
+    end
+
+    def escape(uri)
       uri.gsub(URIREGEX[:escape]) do
         '%' + $1.unpack('H2' * $1.size).join('%').upcase
       end.gsub(' ','%20')
     end

-    def self.unescape(uri)
+    def unescape(uri)
       uri.tr('+', ' ').gsub(URIREGEX[:unescape]) do
         [$1.delete('%')].pack('H*')
       end
     end

-    def self.clean(uri)
+    def clean(uri)
       normalize(c18n(unescape(uri))).to_s
     end

-    def self.normalize(uri)
+    def normalize(uri)
       u = parse(uri)
       u.path = u.path.squeeze('/')
       u.query = nil if u.query && u.query.empty?
       u.fragment = nil
       u
     end

-    def self.c18n(uri)
+    def c18n(uri)
       u = parse(uri)

       if q = u.query_values(:notation => :flat_array)
         q.delete_if { |k,v| C18N[:global].include?(k) }
         q.delete_if { |k,v| C18N[:hosts].find {|r,p| u.host =~ r && p.include?(k) } }
@@ -113,10 +134,10 @@
       u.query_values = q
       u
     end

-    def self.parse(uri)
+    def parse(uri)
       return uri if uri.is_a? Addressable::URI

       uri = uri.index(URIREGEX[:protocol]) == 0 ? uri : "http://#{uri}"
       Addressable::URI.parse(uri).normalize
     end
\ No newline at end of file