require 'mechanize'
require 'nokogiri'

# Enables the spidering of websites by utilizing Mechanize.
class Crawler
  def initialize
    @mech = Mechanize.new
    @mech.max_history = nil
  end

  # Kicks off the spidering of a site.
  # @param url [String] A simple URL string to crawl.
  # @return [Mechanize::History] The history of pages crawled.
  #
  def spiderize(url)
    page = @mech.get(url)
    stack = page.links
    stack.push(*src_links(page))

    while link = stack.pop
      next if reject(link)
      puts "crawling #{link.uri}"

      begin
        page = link.click
        next unless Mechanize::Page === page
        stack.push(*src_links(page))
        stack.push(*page.links)
      rescue Mechanize::ResponseCodeError
        # Skip pages that respond with an error code (404, 500, etc.).
      end
    end

    @mech.history
  end

  # Since Mechanize doesn't treat src attributes as links, this returns
  # all src links from a page.
  # @param page [Mechanize::Page] A Mechanize page object.
  # @return [Array<Mechanize::Page::Link>] The created link objects.
  #
  def src_links(page)
    links = []

    page.search('script').each do |element|
      next if element.attributes['src'].nil?
      doc = Nokogiri::HTML::Document.new
      node = Nokogiri::XML::Node.new('foo', doc)
      node['href'] = element.attributes['src'].value
      links.push(Mechanize::Page::Link.new(node, @mech, page))
    end

    links
  end

  # Whether we should refuse to spider a URL.
  # @param link [Mechanize::Page::Link] A Mechanize page link.
  # @return [Boolean] true if we should reject the URL.
  #
  def reject(link)
    # TODO: are we accounting for subdomains?
    not_valid_uri?(link) || not_same_domain?(link) || already_spidered?(link)
  end

  # Checks whether a link has already been crawled.
  # @param link [Mechanize::Page::Link] A Mechanize page link.
  # @return [Boolean] true when already spidered.
  #
  def already_spidered?(link)
    abs_url = @mech.history.first.uri.to_s.chomp('/') + link.href + '/'
    (@mech.visited? link.href) || (@mech.visited? abs_url)
  end

  # Checks whether a URL is able to be crawled.
  # @param link [Mechanize::Page::Link] A Mechanize page link.
  # @return [Boolean] true when the URL is not valid.
  #
  def not_valid_uri?(link)
    return true if link.uri.nil?
    # Accept absolute http(s) URLs and root-relative paths; reject the rest.
    !(%r{\Ahttps?://} =~ link.uri.to_s || %r{\A/} =~ link.uri.to_s)
  end

  # Checks whether a URL is from the same domain.
  # @param link [Mechanize::Page::Link] A Mechanize page link.
  # @return [Boolean] true when not the same domain as the original URL.
  #
  def not_same_domain?(link)
    host = link.uri.host
    !(host.nil? || host == @mech.history.first.uri.host)
  end

  private :not_valid_uri?, :not_same_domain?, :already_spidered?
end
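
# A minimal usage sketch, assuming the mechanize gem is installed;
# 'https://example.com' is a placeholder URL, not part of the original code.
if __FILE__ == $PROGRAM_NAME
  crawler = Crawler.new
  history = crawler.spiderize('https://example.com')

  # Mechanize::History behaves like an Array of Mechanize::Page objects,
  # so we can iterate it to list every URI that was crawled.
  history.each { |page| puts page.uri }
end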