require 'uri'
require 'nokogiri'

module Spidercrawl
  #
  # Wraps a single fetched page: the url it was requested from, the response
  # metadata handed in through the options hash, and the response body, which
  # is parsed with Nokogiri on demand.
  #
  class Page
    # Pattern used by #emails to spot e-mail-looking substrings in the body.
    EMAIL_PATTERN = /[\w.!#\$%+-]+@[\w-]+(?:\.[\w-]+)+/

    # A reference that starts with a URI scheme ("http:", "mailto:", ...).
    # Anchored with \A so a scheme after an embedded newline does not count.
    SCHEME_PATTERN = /\A[a-z][a-z0-9+.\-]*:/i

    # Response codes treated as redirects (308 Permanent Redirect included).
    REDIRECT_CODES = (300..308)

    # URI.escape was removed in Ruby 3.0; the RFC 2396 parser still provides
    # the same escaping rules.
    URI_ESCAPER = URI::RFC2396_Parser.new

    attr_reader :location, :response_time
    attr_accessor :crawled_time

    #
    # Build a page for +url+ (must respond to #scheme, #host and #to_s).
    # Recognised +options+ keys: :response_code, :response_head,
    # :redirect_url, :response_body, :response_time, :crawled_time.
    #
    def initialize(url, options = {})
      @url = url
      @code = options[:response_code]
      @headers = options[:response_head]
      @location = options[:redirect_url]
      @body = options[:response_body]
      @response_time = options[:response_time]
      @crawled_time = options[:crawled_time]
    end

    #
    # Return the url of the page as a string
    #
    def url
      @url.to_s
    end

    #
    # Return the url scheme of the page (e.g. http, https, etc.)
    #
    def scheme
      @url.scheme
    end

    #
    # Return the url host of the page
    #
    def host
      @url.host
    end

    #
    # Return the base url of the page ("scheme://host")
    #
    def base_url
      "#{scheme}://#{host}"
    end

    #
    # Return a freshly parsed Nokogiri html document for the body.
    # Every call parses again, so callers may mutate the result freely.
    # If parsing raises, the error is printed and nil is returned.
    #
    def doc
      Nokogiri::HTML(@body)
    rescue StandardError => e
      # Only StandardError: signals and SystemExit must not be swallowed here.
      puts e.inspect
      puts e.backtrace
      nil
    end

    #
    # Return the headers of the page, exactly as passed in :response_head
    #
    def headers
      @headers
    end

    #
    # Return the title of the page (empty string when there is none)
    #
    def title
      doc.css('head title').inner_text
    end

    #
    # Return all links found in the page as absolute urls; blank hrefs are
    # excluded and duplicates are removed
    #
    def links
      attribute_urls('a', 'href')
    end

    #
    # Return the links whose host equals the host of the page.
    # Links that cannot be parsed as a URI are skipped.
    #
    def internal_links
      select_links_by_host { |link_host| link_host == host }
    end

    #
    # Return the links whose host differs from the host of the page
    # (this includes links without a host, such as mailto:).
    # Links that cannot be parsed as a URI are skipped.
    #
    def external_links
      select_links_by_host { |link_host| link_host != host }
    end

    #
    # Return the MatchData of the first e-mail address found in the body,
    # or nil when there is none (or when there is no body).
    # NOTE(review): only the first match is reported; use
    # content.scan(EMAIL_PATTERN) to collect every address.
    #
    def emails
      content.match(EMAIL_PATTERN)
    end

    #
    # Return all image sources found in the page as absolute urls; blank
    # sources are excluded and duplicates are removed
    #
    def images
      attribute_urls('img', 'src')
    end

    #
    # Return all words (runs of ASCII letters) found in the page text
    #
    def words
      text.split(/[^a-zA-Z]/).reject(&:empty?)
    end

    #
    # Return the nodes of the page declared with type "text/css"
    #
    def css
      doc.search("[@type='text/css']")
    end

    #
    # Not implemented yet; returns nil
    #
    def meta_keywords
    end

    #
    # Not implemented yet; returns nil
    #
    def meta_descriptions
    end

    #
    # Return html content as a string (empty string when there is no body)
    #
    def content
      @body.to_s
    end

    #
    # Return the content type declared by the Content-Type meta tag,
    # or nil when the page has no such tag
    #
    def content_type
      meta = doc.at("meta[@http-equiv='Content-Type']")
      meta && meta['content']
    end

    #
    # Return plain text of the page body without html tags; every line is
    # stripped of surrounding whitespace
    #
    def text
      # doc hands out a fresh parse, so removing nodes here affects nothing else.
      document = doc
      document.css('script, noscript, style, link').each(&:remove)
      document.css('body').text.split("\n").map(&:strip).join("\n")
    end

    #
    # Return the response code
    #
    def response_code
      @code
    end

    #
    # Return true if page not found
    #
    def not_found?
      @code == 404
    end

    #
    # Return true if page is fetched successfully
    #
    def success?
      @code == 200
    end

    #
    # Return true if page is redirected
    #
    def redirect?
      REDIRECT_CODES.include?(@code)
    end

    private

    #
    # Collect +attribute+ from every node matching +selector+ and return the
    # non-blank values as unique absolute urls. Values are stripped before
    # the blank check so whitespace-only attributes are dropped too.
    #
    def attribute_urls(selector, attribute)
      doc.css(selector)
         .map { |node| node[attribute].to_s.strip }
         .reject(&:empty?)
         .map { |value| absolutify(value) }
         .uniq
    end

    #
    # Return the links for which the block, given the link's host, is truthy.
    # A link that is not a valid URI is left out instead of aborting the scan.
    #
    def select_links_by_host
      links.select do |link|
        begin
          yield URI.parse(link).host
        rescue URI::InvalidURIError
          false
        end
      end
    end

    #
    # Return the absolute, escaped form of +page_url+. References that carry
    # their own scheme are only escaped; everything else is resolved against
    # the url of the page. If that resolution fails, the escaped reference is
    # appended to base_url as a best effort.
    #
    def absolutify(page_url)
      escaped = URI_ESCAPER.escape(page_url)
      return escaped if escaped =~ SCHEME_PATTERN

      URI.join(url, escaped).to_s
    rescue URI::Error
      base_url + escaped
    end
  end
end