# Copyright 2014 Bob Aman
#
#    Licensed under the Apache License, Version 2.0 (the "License");
#    you may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.


require 'mime/types'
require 'faraday'
require 'nokogiri'
require 'sanitize'
require 'fastimage'
require 'href_preview/fastimage_uri'
require 'time'

module HRefPreview
  ##
  # Extracts link-preview metadata (title, description, images, article
  # body, timestamps, ...) from an HTTP response.
  #
  # The response object only needs to expose `status`, `headers`, `body`
  # and `env.url`, since those are the only members read by this class.
  #
  # Every accessor is computed lazily and memoized with `||=`, which means
  # a `nil` result is recomputed on each call.
  #
  # Metadata values that are empty or contain only whitespace are treated
  # as missing, so the next candidate source is consulted instead.
  class Preview
    ##
    # Sanitize configuration used to clean the extracted article markup.
    #
    # The attribute whitelist is copied before the `span` entry is
    # overridden, so the shared `Sanitize::Config::RELAXED` hash is never
    # modified by loading this class.
    SANITIZE_OPTIONS = Sanitize::Config::RELAXED.merge(
      :remove_contents => true,
      :elements => %w[
        a abbr address b bdi bdo blockquote br caption cite code col colgroup dd
        del dfn dl dt em figcaption figure h1 h2 h3 h4 h5 h6 hgroup hr i img ins
        kbd li mark ol p pre q rp rt ruby s samp small span strike strong sub
        summary sup table tbody td tfoot th thead time tr u ul var wbr
      ],
      :attributes => Sanitize::Config::RELAXED[:attributes].merge('span' => [])
    )

    ##
    # Matches the `charset` parameter of a Content-Type value. Parameter
    # names are matched case-insensitively.
    CHARSET_PATTERN = /;\s*charset=([^;,]*)/i

    ##
    # Candidate locations of the article container, in priority order.
    # Each entry is a search method of the DOM and the selector to pass it.
    ARTICLE_SELECTORS = [
      [:xpath, "/html[@itemtype='http://schema.org/NewsArticle']//article[@id='story']"],
      [:xpath, "//*/*[@itemtype='http://schema.org/NewsArticle']"],
      [:xpath, "//*/*[@itemprop='articleBody']"],
      [:css, "article div.article-entry"],
      [:css, "article.post div.entry-content"],
      [:css, "div.post div.postBody"],
      [:css, ".pg_story div#leftcolumn div.body"]
    ].freeze

    ##
    # Direct children of the article node whose `class` attribute equals
    # one of these values are dropped from the article markup.
    EXCLUDED_ARTICLE_CLASSES = %w[
      related_links_inline inline-share-btn-label inline-share-btn
    ].freeze

    ##
    # Initializes a `Preview` from an HTTP response.
    #
    # @param response [Object] The HTTP response to inspect.
    # @param connection [Object]
    #   Stored on the instance; not otherwise used by this class.
    def initialize(response, connection=DEFAULT_CONNECTION)
      @response = response
      @connection = connection
    end

    attr_reader :response

    ##
    # Returns the MIME type declared in the HTTP headers or HTML meta
    # tags.
    #
    # @return [MIME::Type] The MIME type of the HTTP response.
    def mime_type
      @mime_type ||= (
        mime_type_for(response.headers['Content-Type']) ||
        first_value_in(dom, :value, [
          "//*/meta[@http-equiv='Content-Type']/@content",
          "//*/meta[@name='dc.format']/@content"
        ]) { |declared| mime_type_for(declared) }
      )
    end

    ##
    # Returns the charset declared in the HTTP headers or HTML meta
    # tags.
    #
    # @return [String] The charset of the HTTP response.
    def charset
      @charset ||= (begin
        # `||` is used rather than `or`: `or` binds more loosely than `=`,
        # which would assign only the first candidate.
        declared =
          charset_in(response.headers['Content-Type']) ||
          charset_in(meta_content("//*/meta[@http-equiv='Content-Type']/@content")) ||
          meta_content("//*/meta/@charset")
        # The parameter value may be quoted, e.g. charset="utf-8".
        declared.strip.gsub(/\A["']|["']\z/, '') if declared
      end)
    end

    ##
    # @return [String]
    #   The two-letter language code for the content, lowercased. An empty
    #   string is returned when a language is declared but does not start
    #   with two letters.
    def language
      @language ||= (begin
        declared =
          non_blank(response.headers['Content-Language']) ||
          meta_content(
            "//*/meta[@http-equiv='Content-Language']/@content",
            "//*/meta[@name='dc.language']/@content"
          )
        # Strip the irrelevant '-US' from 'en-US' if it appears.
        declared.to_s.strip[/\A([a-z]{2})/i, 1].to_s.downcase if declared
      end)
    end

    ##
    # Returns true if the response had a 2xx HTTP code and the mime type
    # is either HTML or XHTML.
    #
    # @return [true, false] Whether successful HTML response or not.
    def is_html?
      successful = response.status >= 200 && response.status < 300
      html = mime_type && mime_type.sub_type =~ /^x?html/
      successful && html ? true : false
    end

    ##
    # The DOM for the response body.
    #
    # @return [Nokogiri::HTML::Document]
    #   The DOM, as generated by Nokogiri.
    def dom
      @dom ||= Nokogiri::HTML(response.body)
    end

    ##
    # Sources, in order: `og:title`, `dc.title`, the headline inside the
    # article node, an `h1`/`h2` headline, `<title>`, `twitter:title` and
    # `sailthru.title`. The site name and surrounding separators are
    # removed from either end of the result.
    #
    # @return [String] The title of the page, or nil if not HTML.
    def title
      @title ||= (if is_html?
        raw =
          meta_content(
            "//*/meta[@property='og:title']/@content",
            "//*/meta[@name='dc.title']/@content"
          ) ||
          (article_node &&
            first_value_in(article_node, :text, ["*[@itemprop='headline']"])) ||
          element_text(
            "//*/*[(self::h1 or self::h2) and @itemprop='headline']",
            "//*/head/title"
          ) ||
          meta_content(
            # Unlikely to ever happen
            "//*/meta[@name='twitter:title']/@content",
            "//*/meta[@name='sailthru.title']/@content"
          )
        strip_site_name(raw.gsub(/&nbsp;/, ' ')) if raw
      end)
    end

    ##
    # @return [String]
    #   The first description found in the page's meta tags, or nil if
    #   the response is not HTML or declares none.
    def description
      @description ||= (if is_html?
        raw = meta_content(
          "//*/meta[@property='og:description']/@content",
          "//*/meta[@name='dc.description']/@content",
          "//*/meta[@itemprop='description']/@content",
          "//*/meta[@name='description']/@content",
          "//*/meta[@name='dcterms.abstract']/@content",
          # Unlikely to ever happen
          "//*/meta[@name='twitter:description']/@content",
          "//*/meta[@name='sailthru.description']/@content"
        )
        raw.gsub(/&nbsp;/, ' ').strip if raw
      end)
    end

    ##
    # @return [Addressable::URI]
    #   The canonical URI declared by the page (`rel=canonical`, then
    #   `og:url`), falling back to the URL of the response itself.
    def canonical_uri
      @canonical_uri ||= (begin
        declared = if is_html?
          meta_content(
            "//*/link[@rel='canonical']/@href",
            "//*/meta[@property='og:url']/@content"
          )
        end
        Addressable::URI.parse(declared || response.env.url.to_s)
      end)
    end

    ##
    # @return [Addressable::URI]
    #   The short link declared by the page, or nil if there is none.
    def shortlink_uri
      @shortlink_uri ||= (if is_html?
        declared = meta_content(
          "//*/link[@rel='shortlink']/@href",
          "//*[@class='story-short-url']/a/@href"
        )
        Addressable::URI.parse(declared) if declared
      end)
    end

    ##
    # @return [Addressable::URI] The URI of the first image, if any.
    def image_uri
      @image_uri ||= (begin
        image = images.first
        Addressable::URI.parse(image.uri) if image
      end)
    end

    ##
    # For HTML responses the images are the `og:image` values followed by
    # the article node's `thumbnailurl` values. For responses whose media
    # type is `image`, the response itself is the only image.
    #
    # @return [Array<FastImage>] One entry per distinct image URI.
    def images
      @images ||= (begin
        image_uris = []
        if is_html?
          image_uris.concat(uris_at(dom, "//*/meta[@property='og:image']/@content"))
          if article_node
            image_uris.concat(
              uris_at(article_node, "meta[@itemprop='thumbnailurl']/@content")
            )
          end
        elsif mime_type && mime_type.media_type == 'image'
          image_uris << canonical_uri
        end
        image_uris.uniq.map { |uri| FastImage.new(uri, :timeout => 0.5) }
      end)
    end

    ##
    # @return [String]
    #   The `og:type` of the page, or 'article' when any element is
    #   marked up as a schema.org NewsArticle.
    def item_type
      @item_type ||= (if is_html?
        meta_content("//*/meta[@property='og:type']/@content") ||
        ('article' if dom.xpath("//*[@itemtype='http://schema.org/NewsArticle']").first)
      end)
    end

    ##
    # @return [String]
    #   The site name from `og:site_name` or `dc.publisher`.
    def site_name
      @site_name ||= (if is_html?
        meta_content(
          "//*/meta[@property='og:site_name']/@content",
          "//*/meta[@name='dc.publisher']/@content"
        )
      end)
    end

    ##
    # @return [String] The Twitter handle used by the site.
    def twitter
      @twitter ||= (if is_html?
        handle = meta_content("//*/meta[@name='twitter:site']/@content")
        # Only values that look like a handle are accepted.
        handle if handle && handle =~ /^@/
      end)
    end

    ##
    # Locates the element containing the article. A candidate selector is
    # accepted only when it matches exactly one node.
    #
    # @return [Nokogiri::XML::Node] The article container, or nil.
    def article_node
      @article_node ||= (find_article_node if is_html?)
    end

    ##
    # @return [String]
    #   The sanitized markup of the article node with whitespace
    #   normalized and empty elements removed, or nil when the response
    #   is not HTML or no article node was found.
    def article_html
      @article_html ||= (if is_html? && article_node
        kept = article_node.children.reject do |child|
          css_class = child.attribute('class')
          css_class && EXCLUDED_ARTICLE_CLASSES.include?(css_class.value)
        end
        html = Sanitize.clean(kept.map(&:to_s).join(''), SANITIZE_OPTIONS)
        excise_empty_elements(normalize_whitespace(html))
      end)
    end

    ##
    # @return [String]
    #   The article with all markup removed, or nil when there is no
    #   article markup.
    def article_text
      @article_text ||= (begin
        html = article_html
        Sanitize.clean(html) if html
      end)
    end

    ##
    # Sources, in order: `article:published_time`, `datepublished` under
    # the article node, `datepublished` anywhere, `dcterms.created` and
    # finally `dc.date`. Values that cannot be parsed are skipped.
    #
    # @return [Time] When the content was published, or nil.
    def published
      @published ||= (
        first_time_in(dom, "//*/meta[@property='article:published_time']/@content") ||
        (article_node &&
          first_time_in(article_node, "meta[@itemprop='datepublished']/@content")) ||
        first_time_in(
          dom,
          "//*/meta[@itemprop='datepublished']/@content",
          "//*/meta[@name='dcterms.created']/@content",
          # Only a date, not a time, and not particularly specific,
          # so this is a fallback at best.
          "//*/meta[@name='dc.date']/@content"
        )
      )
    end

    ##
    # Sources, in order: `article:modified_time`, `datemodified` under
    # the article node, `datemodified` anywhere and `dcterms.modified`.
    # Values that cannot be parsed are skipped.
    #
    # @return [Time] When the content was last modified, or nil.
    def updated
      @updated ||= (
        first_time_in(dom, "//*/meta[@property='article:modified_time']/@content") ||
        (article_node &&
          first_time_in(article_node, "meta[@itemprop='datemodified']/@content")) ||
        first_time_in(
          dom,
          # Searched document-wide, matching the equivalent lookup in
          # `published`.
          "//*/meta[@itemprop='datemodified']/@content",
          "//*/meta[@name='dcterms.modified']/@content"
        )
      )
    end

    ##
    # @return [String] A short representation showing the title.
    def inspect
      addr = '0x' + ('%x' % (object_id << 1)).rjust(14, '0')
      "#<HRefPreview::Preview:#{addr} TITLE=#{title.inspect}>"
    end

    private

    ##
    # Returns `value` unchanged unless it is nil, empty or whitespace
    # only, in which case nil is returned.
    def non_blank(value)
      value unless value.nil? || value.to_s.strip.empty?
    end

    ##
    # Tries each XPath expression against `context` in order. For each
    # expression only the first matching node is considered; `reader`
    # (`:value` or `:text`) is the method used to read it. When a block
    # is given the value is passed through it, and a nil or false result
    # moves on to the next expression.
    #
    # Returns the first usable value, or nil.
    def first_value_in(context, reader, xpaths)
      xpaths.each do |xpath|
        node = context.xpath(xpath).first
        next unless node
        value = non_blank(node.public_send(reader))
        value = yield(value) if value && block_given?
        return value if value
      end
      nil
    end

    ##
    # First non-blank attribute value found in the document for the
    # given XPath expressions.
    def meta_content(*xpaths)
      first_value_in(dom, :value, xpaths)
    end

    ##
    # First non-blank element text found in the document for the given
    # XPath expressions.
    def element_text(*xpaths)
      first_value_in(dom, :text, xpaths)
    end

    ##
    # First parseable time found under `context` for the given XPath
    # expressions.
    def first_time_in(context, *xpaths)
      first_value_in(context, :value, xpaths) { |value| parse_time(value) }
    end

    ##
    # Parses a timestamp, returning nil instead of raising when the value
    # is not a recognizable time.
    def parse_time(value)
      Time.parse(value)
    rescue ArgumentError
      nil
    end

    ##
    # Looks up the registered MIME type for a Content-Type style value.
    def mime_type_for(value)
      MIME::Types[value].first if non_blank(value)
    end

    ##
    # Extracts the charset parameter from a Content-Type style value.
    def charset_in(value)
      non_blank(value.to_s[CHARSET_PATTERN, 1])
    end

    ##
    # All non-blank attribute values matched by `xpath` under `context`,
    # parsed as URIs.
    def uris_at(context, xpath)
      values = context.xpath(xpath).map { |node| non_blank(node.value) }
      values.compact.map { |value| Addressable::URI.parse(value) }
    end

    ##
    # Removes the site name, along with adjacent whitespace and
    # separator characters, from the start and end of `title`.
    def strip_site_name(title)
      # Escaped so that punctuation in the site name is matched literally.
      name = Regexp.escape(site_name.to_s)
      stripped = title.
        gsub(/^#{name}[\s\|\-\:]*/, '').
        gsub(/[\s\|\-\:]*#{name}$/, '').
        strip
      # A title consisting solely of the site name is kept as it is
      # rather than being reduced to an empty string.
      stripped.empty? ? title.strip : stripped
    end

    ##
    # Returns the node of the first selector in ARTICLE_SELECTORS that
    # matches exactly one node, or nil.
    def find_article_node
      ARTICLE_SELECTORS.each do |search, selector|
        nodes = dom.public_send(search, selector)
        return nodes.first if nodes.size == 1
      end
      nil
    end

    ##
    # Normalizes line endings, tabs and runs of blank lines in sanitized
    # markup.
    def normalize_whitespace(html)
      html.
        gsub("\r\n", "\n").
        gsub("\t", "  ").
        gsub(/ *\n */, "\n").
        gsub(/\n\n+/, "\n\n").
        gsub(/<p>\n+/, "<p>\n").
        gsub(/\n+<\/p>/, "\n</p>").
        gsub(/<\/p>\n+/, "</p>\n").
        strip
    end

    ##
    # Reparses `html` as a fragment, removes empty elements from it and
    # returns the resulting markup.
    def excise_empty_elements(html)
      fragment = Nokogiri::HTML.fragment(html)
      fragment.children.each { |child| prune_empty(child) if child.element? }
      fragment.to_s
    end

    ##
    # Unlinks `node` when it is a script element, or when, after its
    # descendants have been pruned, it has no attributes, no element
    # children and no text other than whitespace.
    #
    # NOTE(review): attribute-less void elements such as `br` and `hr`
    # satisfy this test and are removed even though SANITIZE_OPTIONS
    # allows them -- confirm whether that is intended.
    def prune_empty(node)
      if node.name == "script"
        node.unlink
        return
      end
      node.children.each { |child| prune_empty(child) if child.element? }
      # The whole text must be blank; a blank line inside otherwise
      # meaningful text does not make the element empty.
      if node.attribute_nodes.size == 0 &&
          node.text.to_s.strip.empty? &&
          node.children.all? { |child| child.text? }
        node.unlink
      end
    end
  end
end