# -*- encoding: utf-8 -*- require 'addressable/uri' require 'domainatrix' require 'nokogiri' require 'yaml' module PostRank module URI c18ndb = YAML.load_file(File.dirname(__FILE__) + '/postrank-uri/c18n.yml') C18N = {} C18N[:global] = c18ndb[:all].freeze C18N[:hosts] = c18ndb[:hosts].inject({}) {|h,(k,v)| h[/#{Regexp.escape(k)}$/.freeze] = v; h} URIREGEX = {} URIREGEX[:protocol] = /https?:\/\//i URIREGEX[:valid_preceding_chars] = /(?:|\.|[^-\/"':!=A-Z0-9_@@]|^|\:)/i URIREGEX[:valid_domain] = /(?:[^[:punct:]\s][\.-](?=[^[:punct:]\s])|[^[:punct:]\s]){1,}\.[a-z]{2,}(?::[0-9]+)?/i URIREGEX[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~]/i # Allow URL paths to contain balanced parens # 1. Used in Wikipedia URLs like /Primer_(film) # 2. Used in IIS sessions like /S(dfd346)/ URIREGEX[:wikipedia_disambiguation] = /(?:\(#{URIREGEX[:valid_general_url_path_chars]}+\))/i # Allow @ in a url, but only in the middle. Catch things like http://example.com/@user URIREGEX[:valid_url_path_chars] = /(?: #{URIREGEX[:wikipedia_disambiguation]}| @#{URIREGEX[:valid_general_url_path_chars]}+\/| [\.,]#{URIREGEX[:valid_general_url_path_chars]}+| #{URIREGEX[:valid_general_url_path_chars]}+ )/ix # Valid end-of-path chracters (so /foo. does not gobble the period). # 1. Allow =&# for empty URL parameters and other URL-join artifacts URIREGEX[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-]|#{URIREGEX[:wikipedia_disambiguation]}/io URIREGEX[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i URIREGEX[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i URIREGEX[:valid_url] = %r{ ( # $1 total match (#{URIREGEX[:valid_preceding_chars]}) # $2 Preceeding chracter ( # $3 URL (https?:\/\/)? # $4 Protocol (#{URIREGEX[:valid_domain]}) # $5 Domain(s) and optional post number (/ (?: # 1+ path chars and a valid last char #{URIREGEX[:valid_url_path_chars]}+#{URIREGEX[:valid_url_path_ending_chars]}| # Optional last char to handle /@foo/ case #{URIREGEX[:valid_url_path_chars]}+#{URIREGEX[:valid_url_path_ending_chars]}?| # Just a # case #{URIREGEX[:valid_url_path_ending_chars]} )? )? # $6 URL Path and anchor # $7 Query String (\?#{URIREGEX[:valid_url_query_chars]}*#{URIREGEX[:valid_url_query_ending_chars]})? ) ) }iox; URIREGEX[:escape] = /([^ a-zA-Z0-9_.-]+)/x URIREGEX[:unescape] = /((?:%[0-9a-fA-F]{2})+)/x URIREGEX.each_pair{|k,v| v.freeze } module_function def extract(text) return [] if !text urls = [] text.to_s.scan(URIREGEX[:valid_url]) do |all, before, url, protocol, domain, path, query| begin url = clean(url).to_s Domainatrix.parse(url) urls.push url rescue NoMethodError end end urls.compact end def extract_href(text, host = nil) urls = [] Nokogiri.HTML(text).search('a').each do |a| begin url = normalize(c18n(unescape(a.attr('href')))) if url.host.empty? next if host.nil? url.host = host end urls.push [url.to_s, a.text] rescue next end end urls end def escape(uri) uri.gsub(URIREGEX[:escape]) do '%' + $1.unpack('H2' * $1.size).join('%').upcase end.gsub(' ','%20') end def unescape(uri) uri.tr('+', ' ').gsub(URIREGEX[:unescape]) do [$1.delete('%')].pack('H*') end end def clean(uri) normalize(c18n(unescape(uri))).to_s end def normalize(uri) u = parse(uri) u.path = u.path.squeeze('/') u.query = nil if u.query && u.query.empty? u.fragment = nil u end def c18n(uri) u = parse(uri) if q = u.query_values(:notation => :flat_array) q.delete_if { |k,v| C18N[:global].include?(k) } q.delete_if { |k,v| C18N[:hosts].find {|r,p| u.host =~ r && p.include?(k) } } end u.query_values = q u end def parse(uri) return uri if uri.is_a? Addressable::URI uri = uri.index(URIREGEX[:protocol]) == 0 ? uri : "http://#{uri}" Addressable::URI.parse(uri).normalize end end end