# encoding: utf-8
require 'addressable/uri'
require 'digest/md5'
require 'nokogiri'
require 'public_suffix'
require 'yaml'

module Addressable
  class URI
    # Registrable domain of this URI's host as reported by PublicSuffix,
    # or nil when there is no host or PublicSuffix does not consider the
    # host valid (with no default rule).
    def domain
      name = self.host
      return nil unless name && PublicSuffix.valid?(name, default_rule: nil)

      PublicSuffix.parse(name).domain
    end

    # Normalized form of the query string, memoized in @normalized_query.
    #
    # The stripped query is split on "&" (keeping empty trailing pieces),
    # each piece is normalized on its own against the QUERY character class
    # with "\&" removed, and the pieces are re-joined with "&" in their
    # original order. Returns nil when the query is missing or blank.
    def normalized_query
      @normalized_query ||= begin
        raw = self.query
        if raw && raw.strip != ''
          allowed = Addressable::URI::CharacterClasses::QUERY.sub("\\&", "")
          pieces = raw.strip.split("&", -1)
          normalized = pieces.map do |piece|
            Addressable::URI.normalize_component(piece, allowed)
          end
          normalized.join("&")
        end
      end
    end

  end
end

module PostRank
  module URI

    # Canonicalization rules are read once, at load time, from the YAML file
    # that sits next to this source file.
    c14ndb = YAML.load_file(File.dirname(__FILE__) + '/postrank-uri/c14n.yml')

    # C14N[:global] holds the :all entry of the YAML file; c14n drops query
    #               parameters found in it for every host.
    # C14N[:hosts]  maps a host-suffix regex to the :hosts entry for that
    #               suffix; c14n drops those parameters on matching hosts.
    C14N = {}
    C14N[:global] = c14ndb[:all].freeze
    C14N[:hosts]  = c14ndb[:hosts].each_with_object({}) do |(suffix, params), table|
      table[/#{Regexp.escape(suffix)}$/.freeze] = params
    end

    # Regular-expression building blocks used by the methods below. Each value
    # is frozen once the table is complete (the hash itself is not frozen).
    URIREGEX = {}
    # Optional scheme prefix: http:// or https://, case-insensitive.
    URIREGEX[:protocol] = /https?:\/\//i
    # Context permitted immediately before a URL.
    # NOTE(review): the leading empty alternative lets this group match zero
    # characters, so it never prevents a match on its own -- confirm intent.
    URIREGEX[:valid_preceding_chars] = /(?:|\.|[^-\/"':!=A-Z0-9_@＠]|^|\:)/i
    # One or more dot-terminated labels of 1-63 [a-z0-9-] characters, a final
    # label of 2-63 letters, and an optional ":<digits>" port.
    URIREGEX[:valid_domain] = /\b(?:[a-z0-9-]{1,63}\.){1,}[a-z]{2,63}(?::[0-9]+)?/i
    # Single characters accepted anywhere inside a path.
    URIREGEX[:valid_general_url_path_chars] = /[a-z0-9!\*';:=\+\,\$\/%#\[\]\-_~]/i

    # Allow URL paths to contain balanced parens
    #  1. Used in Wikipedia URLs like /Primer_(film)
    #  2. Used in IIS sessions like /S(dfd346)/
    URIREGEX[:wikipedia_disambiguation] = /(?:\(#{URIREGEX[:valid_general_url_path_chars]}+\))/i

    # Allow @ in a url, but only in the middle. Catch things like http://example.com/@user
    URIREGEX[:valid_url_path_chars] = /(?:
      #{URIREGEX[:wikipedia_disambiguation]}|
      @#{URIREGEX[:valid_general_url_path_chars]}+\/|
      [\.,]#{URIREGEX[:valid_general_url_path_chars]}+|
      #{URIREGEX[:valid_general_url_path_chars]}+
    )/ix

    # Valid end-of-path characters (so /foo. does not gobble the period).
    #   1. Allow = and # for empty URL parameters and other URL-join artifacts
    #      NOTE(review): this comment used to list "&" as well, but the
    #      character class below does not contain it.
    URIREGEX[:valid_url_path_ending_chars] = /[a-z0-9=_#\/\+\-]|#{URIREGEX[:wikipedia_disambiguation]}/io
    # Characters accepted inside a query string, and the narrower set a query
    # string is allowed to end with.
    URIREGEX[:valid_url_query_chars] = /[a-z0-9!\*'\(\);:&=\+\$\/%#\[\]\-_\.,~]/i
    URIREGEX[:valid_url_query_ending_chars] = /[a-z0-9_&=#\/]/i

    # Full URL matcher assembled from the pieces above. It has seven capture
    # groups, which extract receives (in order) as the block arguments of
    # String#scan: total match, preceding character, URL, protocol, domain,
    # path, query.
    URIREGEX[:valid_url] = %r{
          (                                               #   $1 total match
            (#{URIREGEX[:valid_preceding_chars]})         #   $2 Preceeding chracter
            (                                             #   $3 URL
              (https?:\/\/)?                              #   $4 Protocol
              (#{URIREGEX[:valid_domain]})                #   $5 Domain(s) and optional post number
              (/
                (?:
                  # 1+ path chars and a valid last char
                  #{URIREGEX[:valid_url_path_chars]}+#{URIREGEX[:valid_url_path_ending_chars]}|
                  # Optional last char to handle /@foo/ case
                  #{URIREGEX[:valid_url_path_chars]}+#{URIREGEX[:valid_url_path_ending_chars]}?|
                  # Just a # case
                  #{URIREGEX[:valid_url_path_ending_chars]}
                )?
              )?                                          #   $6 URL Path and anchor
              # $7 Query String
              (\?#{URIREGEX[:valid_url_query_chars]}*#{URIREGEX[:valid_url_query_ending_chars]})?
            )
          )
        }iox;

    # Percent-encoded "?" (%3F) and "&" (%26); unescape leaves these encoded.
    URIREGEX[:reserved_characters] = /%3F|%26/i
    # Runs of characters other than space, ASCII letters, digits, "_", "." and
    # "-"; escape percent-encodes each run.
    URIREGEX[:escape]   = /([^ a-zA-Z0-9_.-]+)/x
    # A single percent-encoded byte (%XX).
    URIREGEX[:unescape] = /(%[0-9a-fA-F]{2})/x
    # Two consecutive slashes not immediately preceded by "http:" or "https:".
    URIREGEX[:double_slash_outside_scheme] = /(?<!http:|https:)\/{2}/x
    # Freeze every regex value in the table.
    URIREGEX.each_pair{|k,v| v.freeze }

    # Everything defined below is exposed as a module function, callable as
    # PostRank::URI.<name>.
    module_function

    # Scan free text for URLs and return them cleaned, as an Array of Strings.
    #
    # text - anything responding to to_s; nil/false yields an empty Array.
    #
    # A candidate is kept only when PublicSuffix accepts its domain capture
    # (no default rule). Each kept URL is passed through clean.
    def extract(text)
      return [] unless text

      candidates = text.to_s.scan(URIREGEX[:valid_url])
      candidates.each_with_object([]) do |captures, found|
        # Capture order: total, preceding char, URL, protocol, domain, path, query.
        _total, _before, url, _protocol, domain = captures
        next unless PublicSuffix.valid?(domain, default_rule: nil)

        found << clean(url).to_s
      end
    end

    # Collect the links of every <a> element in an HTML fragment.
    #
    # text - HTML source handed to Nokogiri.
    # host - optional host passed to clean (as :host) for hrefs lacking one.
    #
    # Returns an Array of [cleaned_url_string, anchor_text] pairs. Only
    # absolute URLs are kept; an anchor whose href makes clean (or anything
    # else in the loop body) raise is skipped on a best-effort basis.
    def extract_href(text, host = nil)
      anchors = Nokogiri.HTML(text).search('a')
      anchors.each_with_object([]) do |anchor, links|
        begin
          target = clean(anchor.attr('href'), :raw => true, :host => host)
          links << [target.to_s, anchor.text] if target.absolute?
        rescue StandardError
          next
        end
      end
    end

    # Percent-encode a string.
    #
    # uri - the String to encode.
    #
    # Every run of characters matched by URIREGEX[:escape] (anything other
    # than space, ASCII letters, digits, "_", "." and "-") is replaced by the
    # upper-case %XX encoding of each of its bytes; spaces become "%20".
    #
    # Returns a new String.
    def escape(uri)
      encoded = uri.gsub(URIREGEX[:escape]) do |chunk|
        # Unpack by byte count, not character count: a multibyte character
        # spans several bytes and each of them needs its own %XX triplet.
        '%' + chunk.unpack('H2' * chunk.bytesize).join('%').upcase
      end
      encoded.gsub(' ', '%20')
    end

    # Decode percent-encoded bytes in a URI.
    #
    # uri - a String or Addressable::URI (handed to parse first).
    #
    # "+" in the query is turned into a space, then every %XX sequence is
    # replaced by the byte it encodes -- except those matching
    # URIREGEX[:reserved_characters] (%3F and %26), which stay encoded.
    #
    # Returns a String.
    def unescape(uri)
      parsed = parse(uri)
      parsed.query = parsed.query.tr('+', ' ') if parsed.query

      parsed.to_s.gsub(URIREGEX[:unescape]) do |token|
        # Keep reserved sequences exactly as they appeared.
        next token if token.match(URIREGEX[:reserved_characters])

        [token.delete('%')].pack('H*')
      end
    end

    # Run a URI through the full pipeline: unescape, c14n, then normalize.
    #
    # uri  - a String or Addressable::URI.
    # opts - options forwarded to c14n; additionally:
    #        :raw - when truthy, return the URI object produced by normalize
    #               instead of its String form.
    def clean(uri, opts = {})
      canonical = c14n(unescape(uri), opts)
      cleaned = normalize(canonical)
      return cleaned if opts[:raw]

      cleaned.to_s
    end

    # MD5 hex digest of a URI string.
    #
    # uri  - the String to digest.
    # opts - :clean - only when exactly `true`, the URI is passed through
    #                 clean before being digested.
    def hash(uri, opts = {})
      input = if opts[:clean] == true
        clean(uri)
      else
        uri
      end
      Digest::MD5.hexdigest(input)
    end

    # Tidy up a parsed URI in place and return it.
    #
    # uri  - a String or Addressable::URI (handed to parse with opts).
    # opts - options forwarded to parse.
    #
    # Replaces matches of URIREGEX[:double_slash_outside_scheme] in the path
    # with a single slash, removes one trailing slash unless the path is
    # exactly one character long, clears an empty query, and always drops
    # the fragment.
    def normalize(uri, opts = {})
      parse(uri, opts).tap do |parsed|
        parsed.path = parsed.path.gsub(URIREGEX[:double_slash_outside_scheme], '/')
        # Re-read the path after assignment rather than reusing the local value.
        parsed.path = parsed.path.chomp('/') unless parsed.path.size == 1
        parsed.query = nil if parsed.query && parsed.query.empty?
        parsed.fragment = nil
      end
    end

    # Canonicalize a URI.
    #
    # uri  - a String or Addressable::URI (handed to parse with opts).
    # opts - options forwarded to parse.
    #
    # Steps, in order:
    #   1. Swap in an embedded target URL when embedded finds one.
    #   2. Drop query parameters named in C14N[:global], and those named in a
    #      C14N[:hosts] entry whose regex matches the host.
    #   3. On twitter.com / mobile.twitter.com, turn a "#!..." fragment into
    #      the path.
    #   4. On tumblr.com hosts, strip whatever follows the last slash of a
    #      path containing "/post/<digits>/".
    #
    # Returns the URI object.
    def c14n(uri, opts = {})
      target = embedded(parse(uri, opts))

      pairs = target.query_values(Array)
      if pairs
        pairs.reject! do |key, _value|
          C14N[:global].include?(key) ||
            C14N[:hosts].any? { |pattern, params| target.host =~ pattern && params.include?(key) }
        end
      end
      target.query_values = pairs

      if target.host =~ /^(mobile\.)?twitter\.com$/ && target.fragment
        hashbang = target.fragment.match(/^!(.*)/)
        if hashbang
          target.fragment = nil
          target.path = hashbang[1]
        end
      end

      on_tumblr_post = target.host =~ /tumblr\.com$/ && target.path =~ /\/post\/\d+\//
      target.path = target.path.gsub(/[^\/]+$/, '') if on_tumblr_post

      target
    end

    # Unwrap a redirector URL that carries its real target in a query
    # parameter.
    #
    # uri - a parsed URI object responding to host, path and query_values.
    #
    # Recognized wrappers:
    #   * news.google.com/news/url and any xfruits.com URL -> "url" parameter
    #   * hosts matching myspace.com with "PostTo" in the path -> "u" parameter
    #
    # Returns the cleaned target (clean with :raw => true) when one is found;
    # otherwise returns uri unchanged. A wrapper URL without a query string
    # is returned unchanged rather than raising.
    def embedded(uri)
      target = if (uri.host == 'news.google.com' && uri.path == '/news/url') ||
                  uri.host == 'xfruits.com'
        # query_values is nil when the URI has no query string at all.
        (uri.query_values || {})['url']
      elsif uri.host =~ /myspace\.com/ && uri.path =~ /PostTo/
        (uri.query_values || {})['u']
      end

      target ? clean(target, :raw => true) : uri
    end

    # Parse a String into a normalized Addressable::URI, inferring a missing
    # host and scheme where possible.
    #
    # uri  - a String, or an Addressable::URI (returned untouched).
    # opts - :host - host to assign when the input has neither scheme nor host.
    #
    # When the parsed URI has no host and its scheme is not exactly
    # javascript, mailto or xmpp:
    #   * if a scheme was detected, the input is re-parsed with "http://"
    #     prepended;
    #   * otherwise opts[:host] is used when given, else the first path
    #     segment is promoted to host if it matches URIREGEX[:valid_domain].
    # A URI that ends up with a host but no scheme gets the "http" scheme.
    #
    # Returns the result of Addressable::URI#normalize!.
    def parse(uri, opts = {})
      return uri if uri.is_a? Addressable::URI

      uri = Addressable::URI.parse(uri)

      # The alternation is grouped and anchored so that only these exact
      # schemes are exempt; ungrouped, "^" and "$" would bind to the first and
      # last alternative only and "mailto" would match anywhere in the scheme.
      if !uri.host && uri.scheme !~ /\A(?:javascript|mailto|xmpp)\z/
        if uri.scheme
          # With no host and scheme yes, the parser exploded
          return parse("http://#{uri}", opts)
        end

        if opts[:host]
          uri.host = opts[:host]
        else
          parts = uri.path.to_s.split(/[\/:]/)
          if parts.first =~ URIREGEX[:valid_domain]
            host = parts.shift
            uri.path = '/' + parts.join('/')
            uri.host = host
          end
        end
      end

      uri.scheme = 'http' if uri.host && !uri.scheme
      uri.normalize!
    end

    # Whether a URI has a host that PublicSuffix recognizes.
    #
    # uri - a String or Addressable::URI; nil is never valid.
    #
    # The URI is cleaned first; its host is converted to Unicode via
    # Addressable::IDNA and checked with PublicSuffix (no default rule).
    # Returns false when the cleaned URI has no host.
    def valid?(uri)
      return false if uri.nil?

      host = clean(uri, :raw => true).host
      return false unless host

      PublicSuffix.valid?(Addressable::IDNA.to_unicode(host), default_rule: nil)
    end
  end
end