lib/postrank-uri.rb in postrank-uri-1.0.0 vs lib/postrank-uri.rb in postrank-uri-1.0.1
- old
+ new
@@ -1,9 +1,10 @@
# -*- encoding: utf-8 -*-
require 'addressable/uri'
require 'domainatrix'
+require 'nokogiri'
require 'yaml'
module PostRank
module URI
@@ -62,11 +63,13 @@
URIREGEX[:escape] = /([^ a-zA-Z0-9_.-]+)/x
URIREGEX[:unescape] = /((?:%[0-9a-fA-F]{2})+)/x
URIREGEX.each_pair{|k,v| v.freeze }
- def self.extract(text)
+ module_function
+
+ def extract(text)
return [] if !text
urls = []
text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query|
begin
url = clean(url).to_s
@@ -77,35 +80,53 @@
end
urls.compact
end
- def self.escape(uri)
# Collects the links found in a piece of HTML.
#
# The text is parsed with Nokogiri, every <a> element is visited, and its
# href is run through unescape -> c18n -> normalize. The result maps each
# cleaned URL string to the text of its anchor; when two anchors clean to the
# same URL string, the later anchor's text replaces the earlier one.
#
# text - markup to scan; passed straight to Nokogiri.HTML.
# host - optional host assigned to links whose parsed host is empty. When
#        nil (the default) such links are left out of the result.
#
# Returns a Hash of URL String => anchor text.
+ def extract_href(text, host = nil)
+ urls = {}
+ Nokogiri.HTML(text).search('a').each do |a|
+ begin
+ url = normalize(c18n(unescape(a.attr('href'))))
# An empty host is presumably what a relative href parses to -- TODO confirm.
# Such a link is only kept when the caller supplied a host to fill in.
+ if url.host.empty?
+ next if host.nil?
+ url.host = host
+ end
+
+ urls[url.to_s] = a.text
# Best effort: any StandardError raised while handling this anchor makes the
# anchor be skipped silently. NOTE(review): that presumably covers anchors
# with no href and a nil url.host (nil has no empty?) -- confirm.
+ rescue
+ next
+ end
+ end
+ urls
+ end
+
# Percent-encodes a URI string.
#
# Every run of characters matched by URIREGEX[:escape] is replaced by its
# unpacked hex pairs, upper-cased and each prefixed with '%'. Any space in
# the result is then rewritten as '%20'.
#
# NOTE(review): 'H2' is repeated $1.size times. If size counts characters
# rather than bytes on the running Ruby, a multibyte run would be encoded
# only partially -- confirm whether bytesize is what is wanted here.
#
# uri - String to encode.
#
# Returns the encoded String.
+ def escape(uri)
uri.gsub(URIREGEX[:escape]) do
'%' + $1.unpack('H2' * $1.size).join('%').upcase
end.gsub(' ','%20')
end
# Decodes a percent-encoded URI string.
#
# Each '+' is first turned into a space; then every run of %XX sequences
# matched by URIREGEX[:unescape] is replaced by the bytes its hex digits
# describe. Because the '+' translation happens before decoding, a '+' that
# was encoded as %2B comes out as a literal '+'.
#
# uri - String to decode.
#
# Returns the decoded String.
- def self.unescape(uri)
+ def unescape(uri)
uri.tr('+', ' ').gsub(URIREGEX[:unescape]) do
# Drop the '%' separators and pack the remaining hex digits back into bytes.
[$1.delete('%')].pack('H*')
end
end
# Runs the whole cleaning pipeline on a URI: unescape, then c18n, then
# normalize, and converts the result to a String.
#
# uri - the URI to clean; it is handed to unescape first.
#
# Returns the cleaned URI as a String.
- def self.clean(uri)
+ def clean(uri)
normalize(c18n(unescape(uri))).to_s
end
# Tidies a URI: collapses repeated '/' in the path, drops a query that is
# present but empty, and removes the fragment.
#
# uri - anything parse accepts. Note that parse returns an Addressable::URI
#       argument as-is, so such an argument is modified in place.
#
# Returns the URI object produced by parse, after the changes above.
- def self.normalize(uri)
+ def normalize(uri)
u = parse(uri)
# squeeze('/') turns every run of slashes into a single slash.
u.path = u.path.squeeze('/')
u.query = nil if u.query && u.query.empty?
u.fragment = nil
u
end
- def self.c18n(uri)
+ def c18n(uri)
u = parse(uri)
if q = u.query_values(:notation => :flat_array)
q.delete_if { |k,v| C18N[:global].include?(k) }
q.delete_if { |k,v| C18N[:hosts].find {|r,p| u.host =~ r && p.include?(k) } }
@@ -113,10 +134,10 @@
u.query_values = q
u
end
# Turns its argument into an Addressable::URI.
#
# An argument that already is an Addressable::URI is returned untouched (no
# copy is made and normalize is not called on it). Otherwise "http://" is
# prepended unless URIREGEX[:protocol] matches at index 0 of the string
# (presumably meaning it already starts with a scheme -- the pattern is
# defined elsewhere, confirm there), and the string is parsed and normalized
# by Addressable.
#
# uri - an Addressable::URI, or an object responding to index and usable in
#       string interpolation (a String in practice).
#
# Returns an Addressable::URI.
- def self.parse(uri)
+ def parse(uri)
return uri if uri.is_a? Addressable::URI
uri = uri.index(URIREGEX[:protocol]) == 0 ? uri : "http://#{uri}"
Addressable::URI.parse(uri).normalize
end
\ No newline at end of file