require 'spidr/extensions/uri'

require 'uri'
require 'nokogiri'

module Spidr
  class Page

    # The URL of the page, exactly as it was given to {#initialize}.
    attr_reader :url

    # The HTTP response object that was given to {#initialize}.
    attr_reader :response

    # The response headers, captured once from +response.to_hash+ when the
    # page was created.
    attr_reader :headers

    #
    # Builds a Page from a URL and the response that was received for it.
    #
    # @param [URI::HTTP] url
    #   The URL the page was requested from.
    #
    # @param [Net::HTTP::Response] response
    #   The response received for the request; its headers are captured
    #   via +to_hash+.
    #
    def initialize(url,response)
      @url, @response = url, response
      @headers = @response.to_hash

      # the parsed document is built lazily by #doc
      @doc = nil
    end

    #
    # The numeric HTTP status of the response.
    #
    # @return [Integer]
    #   The response's status code, converted with +to_i+.
    #
    def code
      status = @response.code
      status.to_i
    end

    #
    # Checks whether the request succeeded with status +200+.
    #
    # @return [Boolean]
    #   +true+ if the response code is +200+, +false+ otherwise.
    #
    def is_ok?
      200 == code
    end

    alias ok? is_ok?

    #
    # Determines if the response is a redirect, i.e. the response code is
    # +300+, +301+, +302+, +303+ or +307+.
    #
    # These are the same codes for which {#links} follows the +Location+
    # header, so the two methods agree on what counts as a redirect.
    #
    # @return [Boolean]
    #   Specifies whether the response code is a redirect code.
    #
    def is_redirect?
      case code
      when 300..303, 307
        true
      else
        false
      end
    end

    alias redirect? is_redirect?

    #
    # Determines if the response code is +408+ (Request Timeout).
    #
    # @return [Boolean]
    #   Specifies whether the response code is +408+.
    #
    def timedout?
      # 408 is Request Timeout; 308 is a permanent redirect, not a timeout
      code == 408
    end

    #
    # Checks whether the server rejected the request with status +400+.
    #
    # @return [Boolean]
    #   +true+ if the response code is +400+, +false+ otherwise.
    #
    def bad_request?
      case code
      when 400 then true
      else false
      end
    end

    #
    # Checks whether the page requires authentication (status +401+).
    #
    # @return [Boolean]
    #   +true+ if the response code is +401+, +false+ otherwise.
    #
    def is_unauthorized?
      401 == code
    end

    alias unauthorized? is_unauthorized?

    #
    # Checks whether access to the page was refused (status +403+).
    #
    # @return [Boolean]
    #   +true+ if the response code is +403+, +false+ otherwise.
    #
    def is_forbidden?
      status = code

      status == 403
    end

    alias forbidden? is_forbidden?

    #
    # Checks whether the page could not be found (status +404+).
    #
    # @return [Boolean]
    #   +true+ if the response code is +404+, +false+ otherwise.
    #
    def is_missing?
      case code
      when 404 then true
      else false
      end
    end

    alias missing? is_missing?

    #
    # Checks whether the server failed with status +500+.
    #
    # @return [Boolean]
    #   +true+ if the response code is +500+, +false+ otherwise.
    #
    def had_internal_server_error?
      500 == code
    end

    #
    # The value of the +Content-Type+ header of the response.
    #
    # @return [String]
    #   Whatever the response returns for the +Content-Type+ header.
    #
    def content_type
      header_value = @response['Content-Type']
      header_value
    end

    #
    # Checks whether the Content-Type of the page begins with +text/plain+.
    #
    # @return [Boolean]
    #   +true+ if the page is plain-text, +false+ otherwise (including
    #   when there is no Content-Type).
    #
    def plain_text?
      !(content_type =~ %r{\Atext/plain}).nil?
    end

    alias txt? plain_text?

    #
    # Checks whether the Content-Type of the page begins with +text/html+.
    #
    # @return [Boolean]
    #   +true+ if the page is an HTML document, +false+ otherwise
    #   (including when there is no Content-Type).
    #
    def html?
      !(content_type =~ %r{\Atext/html}).nil?
    end

    #
    # Determines if the page is an XML document.
    #
    # Both +text/xml+ and +application/xml+ are accepted, since servers
    # use either media type for XML content.
    #
    # @return [Boolean]
    #   Specifies whether the Content-Type of the page begins with
    #   +text/xml+ or +application/xml+.
    #
    def xml?
      !(content_type =~ %r{\A(?:text|application)/xml}).nil?
    end

    #
    # Checks whether the Content-Type of the page begins with +text/xsl+.
    #
    # @return [Boolean]
    #   +true+ if the page is an XML Stylesheet (XSL), +false+ otherwise.
    #
    def xsl?
      !(content_type =~ %r{\Atext/xsl}).nil?
    end

    #
    # Checks whether the Content-Type of the page begins with
    # +text/javascript+ or +application/javascript+.
    #
    # @return [Boolean]
    #   +true+ if the page is JavaScript, +false+ otherwise.
    #
    def javascript?
      !(content_type =~ %r{\A(text|application)/javascript}).nil?
    end

    #
    # Checks whether the Content-Type of the page begins with +text/css+.
    #
    # @return [Boolean]
    #   +true+ if the page is a CSS stylesheet, +false+ otherwise.
    #
    def css?
      !(content_type =~ %r{\Atext/css}).nil?
    end

    #
    # Checks whether the Content-Type of the page begins with
    # +application/rss+xml+ or +application/rdf+xml+.
    #
    # @return [Boolean]
    #   +true+ if the page is a RSS feed, +false+ otherwise.
    #
    def rss?
      !(content_type =~ %r{\Aapplication/(rss|rdf)\+xml}).nil?
    end

    #
    # Checks whether the Content-Type of the page begins with
    # +application/atom+xml+.
    #
    # @return [Boolean]
    #   +true+ if the page is an Atom feed, +false+ otherwise.
    #
    def atom?
      !(content_type =~ %r{\Aapplication/atom\+xml}).nil?
    end

    #
    # Checks whether the Content-Type of the page begins with
    # +application/msword+.
    #
    # @return [Boolean]
    #   +true+ if the page is a MS Word document, +false+ otherwise.
    #
    def ms_word?
      !(content_type =~ %r{\Aapplication/msword}).nil?
    end

    #
    # Checks whether the Content-Type of the page begins with
    # +application/pdf+.
    #
    # @return [Boolean]
    #   +true+ if the page is a PDF document, +false+ otherwise.
    #
    def pdf?
      !(content_type =~ %r{\Aapplication/pdf}).nil?
    end

    #
    # Checks whether the Content-Type of the page begins with
    # +application/zip+.
    #
    # @return [Boolean]
    #   +true+ if the page is a ZIP archive, +false+ otherwise.
    #
    def zip?
      !(content_type =~ %r{\Aapplication/zip}).nil?
    end

    #
    # The raw body of the response.
    #
    # @return [String]
    #   The body, as returned by the response object.
    #
    def body
      payload = @response.body
      payload
    end

    #
    # Parses the body into a document object, for HTML, XML, XSL, RSS and
    # Atom pages. The parsed document is memoized in +@doc+.
    #
    # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
    #   The document that represents the HTML or XML page.
    #   Returns +nil+ if the body is missing or empty, if the page is
    #   none of the supported types, or if an error was raised while
    #   detecting the type or parsing.
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
    #
    def doc
      content = body
      return nil if content.nil? || content.empty?

      parsed = nil

      begin
        if html?
          parsed = (@doc ||= Nokogiri::HTML(content))
        elsif xml? || xsl? || rss? || atom?
          parsed = (@doc ||= Nokogiri::XML(content))
        end
      rescue
        # any failure while detecting the type or parsing yields nil
        parsed = nil
      end

      parsed
    end

    #
    # Runs XPath or CSS Path queries against the parsed document.
    #
    # @param [Array<String>] paths
    #   CSS or XPath expressions to search the document with.
    #
    # @return [Array]
    #   Whatever the document's +search+ returns for the given paths.
    #   Returns an empty Array when there is no parsed document.
    #
    # @example
    #   page.search('//a[@href]')
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
    #
    def search(*paths)
      document = doc

      document ? document.search(*paths) : []
    end

    #
    # Finds the first match of an XPath or CSS Path expression in the
    # parsed document.
    #
    # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
    #   Whatever the document's +at+ returns for the given arguments.
    #   Returns +nil+ when there is no parsed document.
    #
    # @example
    #   page.at('//title')
    #
    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
    #
    def at(*arguments)
      document = doc

      document ? document.at(*arguments) : nil
    end

    # Operator shorthands: +page / path+ is the same as #search, and
    # +page % path+ is the same as #at.
    alias / search
    alias % at

    #
    # The title of the page, taken from the first +//title+ element.
    #
    # @return [String, nil]
    #   The inner-text of the title element, or +nil+ when no title
    #   element could be found.
    #
    def title
      title_node = at('//title')

      title_node.inner_text if title_node
    end

    #
    # Collects every link found in the response.
    #
    # For response codes +300+ through +303+ and +307+ the +Location+
    # header value(s) come first. For HTML pages they are followed by the
    # +href+ of +a+ elements, the +src+ of +frame+ and +iframe+ elements,
    # the +href+ of +link+ elements and the +src+ of +script+ elements,
    # in that order. +nil+ and empty values are skipped.
    #
    # @return [Array<String>]
    #   All links within the HTML page, frame/iframe source URLs and any
    #   links in the +Location+ header.
    #
    def links
      found = []
      collect = lambda { |link|
        found << link unless (link.nil? || link.empty?)
      }

      if [300, 301, 302, 303, 307].include?(code)
        location = @headers['location']

        # the header may hold several URLs (Array) or a single value
        targets = location.kind_of?(Array) ? location : [location]
        targets.each(&collect)
      end

      if (html? && doc)
        # element name => attribute that carries the URL, in scan order
        sources = [
          ['a', 'href'],
          ['frame', 'src'],
          ['iframe', 'src'],
          ['link', 'href'],
          ['script', 'src']
        ]

        sources.each do |tag, attribute|
          doc.search("#{tag}[@#{attribute}]").each do |node|
            collect.call(node.get_attribute(attribute))
          end
        end
      end

      return found
    end

    #
    # The links of the page, expanded to absolute URIs.
    #
    # @return [Array<URI::HTTP>]
    #   The result of #to_absolute for every link from #links, with the
    #   links that could not be converted (+nil+) left out.
    #
    def urls
      absolute = []

      links.each do |link|
        uri = to_absolute(link)
        absolute << uri unless uri.nil?
      end

      absolute
    end

    #
    # Resolves a link against the URL of the page and normalizes the path
    # of the result.
    #
    # @param [String] link
    #   The link to normalize and expand.
    #
    # @return [URI::HTTP, nil]
    #   The normalized URI, or +nil+ when merging the link raised
    #   +URI::InvalidURIError+.
    #
    def to_absolute(link)
      merged =
        begin
          @url.merge(link.to_s)
        rescue URI::InvalidURIError
          nil
        end

      return nil if merged.nil?

      path = merged.path

      # URI::Generic#merge cannot normalize paths such as "/stuff/../",
      # so any "." or ".." directories are expanded away here
      merged.path = URI.expand_path(path) unless (path.nil? || path.empty?)

      return merged
    end

    protected

    #
    # Provides transparent access to the response headers.
    #
    # A call without arguments or a block is treated as a header lookup:
    # every underscore in the method name is replaced with a dash (so
    # +x_frame_options+ reads the +x-frame-options+ header) and the value
    # is returned if the response has that header. Anything else is
    # passed on to +super+.
    #
    # @param [Symbol] sym
    #   The name of the missing method.
    #
    # @return [Object]
    #   The value of the matching header.
    #
    # @raise [NoMethodError]
    #   The response has no header matching the method name.
    #
    def method_missing(sym,*args,&block)
      if (args.empty? && block.nil?)
        # replace every underscore, not just the first one
        name = sym.id2name.tr('_','-')

        return @response[name] if @response.key?(name)
      end

      return super(sym,*args,&block)
    end

    #
    # Keeps +respond_to?+ consistent with #method_missing for header names.
    #
    # @param [Symbol] sym
    #   The method name being queried.
    #
    # @param [Boolean] include_private
    #   Passed through to +super+.
    #
    # @return [Boolean]
    #   +true+ if the response has a header matching the method name.
    #
    def respond_to_missing?(sym,include_private=false)
      return true if (@response && @response.key?(sym.to_s.tr('_','-')))

      return super
    end

  end
end