require 'thread'
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/page_hash'

module Anemone

  VERSION = '0.2.4'

  #
  # Convenience method to start a crawl
  #
  def Anemone.crawl(urls, options = {}, &block)
    Core.crawl(urls, options, &block)
  end

  class Core
    # PageHash storing all Page objects encountered during the crawl
    attr_reader :pages

    # Hash of options for the crawl
    attr_accessor :opts

    DEFAULT_OPTS = {
      # run 4 Tentacle threads to fetch pages
      :threads => 4,
      # disable verbose output
      :verbose => false,
      # don't throw away the page response body after scanning it for links
      :discard_page_bodies => false,
      # identify self as Anemone/VERSION
      :user_agent => "Anemone/#{Anemone::VERSION}",
      # no delay between requests
      :delay => 0,
      # don't obey the robots exclusion protocol
      :obey_robots_txt => false,
      # by default, don't limit the depth of the crawl
      :depth_limit => false,
      # number of times HTTP redirects will be followed
      :redirect_limit => 5,
      # HTTP Basic authentication credentials
      :authorization => nil,
    }

    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
    def initialize(urls, opts = {})
      process_options opts

      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each{ |url|
        url.path = '/' if url.path.empty?
        authorization(url) if url.user
      }

      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []

      yield self if block_given?
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(urls, opts = {})
      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
    end

    #
    # Add a block to be executed on the PageHash after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end

    #
    # Add a block to be executed on every Page as it is encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Specify a block which will select which links to follow on each page.
    # The block should return an Array of URI objects.
    #
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end

    #
    # Perform the crawl
    #
    def run
      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?

      link_queue = Queue.new
      page_queue = Queue.new

      @opts[:threads].times do
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end

      @urls.each{ |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq

        @pages[page.url] = page

        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]

        # perform the on_every_page blocks for this page
        do_page_blocks(page)

        page.discard_doc! if @opts[:discard_page_bodies]

        links_to_follow(page).each do |link|
          link_queue.enq([link, page])
          @pages[link] = nil
        end

        # create an entry in the page hash for each alias of this page,
        # i.e. all the pages that redirected to this page
        page.aliases.each do |aka|
          if !@pages.has_key?(aka) or @pages[aka].nil?
            @pages[aka] = page.alias_clone(aka)
          end
          @pages[aka].add_alias!(page.url)
        end

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          # wait until every tentacle is blocked on the link queue (i.e. idle)
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end

          if page_queue.empty?
            @tentacles.size.times { link_queue.enq(:END) }
            break
          end
        end
      end

      @tentacles.each { |t| t.join }

      do_after_crawl_blocks()

      self
    end

    private

    def process_options(options)
      @opts = DEFAULT_OPTS.merge options

      authorization(@opts[:authorization])

      @opts[:threads] = 1 if @opts[:delay] > 0

      @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
    end

    # Generate the Authorization header value only if it is not already set
    def authorization(auth = nil)
      return if @opts[:authorization] =~ /^Basic .*/
      require 'base64'

      # Base64-encode the credentials; strip newlines so the header
      # value is a single line, as HTTP Basic auth requires
      if auth.is_a?(String) && auth.include?(':')
        @opts[:authorization] = "Basic #{Base64.encode64(auth).delete("\n")}"
      elsif auth.is_a?(Array)
        user = auth.first
        password = auth.last
        @opts[:authorization] = "Basic #{Base64.encode64(user + ':' + password).delete("\n")}"
      elsif auth.is_a?(URI)
        user = auth.user
        password = auth.password
        @opts[:authorization] = "Basic #{Base64.encode64(user + ':' + password).delete("\n")}"
      end
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each { |b| b.call(@pages) }
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |blk|
        blk.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blks|
        if page.url.to_s =~ pattern
          blks.each { |blk| blk.call(page) }
        end
      end
    end

    #
    # Return an Array of links to follow from the given page.
    # Based on whether or not the link has already been crawled,
    # and the block given to focus_crawl()
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
      links.select { |link| visit_link?(link, page) }
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern...
    # and is not excluded by robots.txt...
    # and is not deeper than the depth limit
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
      allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true

      if from_page && @opts[:depth_limit]
        too_deep = from_page.depth >= @opts[:depth_limit]
      else
        too_deep = false
      end

      !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.any? { |p| link.path =~ p }
    end

  end
end
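
#
# Minimal usage sketch of the block-configuration API defined above.
# The start URL, skip pattern, and depth limit are placeholder values,
# and this demo assumes the gem's dependencies (e.g. robots) are installed.
# It only runs when this file is executed directly.
#
if __FILE__ == $0
  Anemone.crawl('http://www.example.com/', :depth_limit => 2, :verbose => true) do |anemone|
    # never follow links whose path matches this pattern
    anemone.skip_links_like(/\/private\//)

    # runs for each page as it is fetched, while the crawl is in progress
    anemone.on_every_page { |page| puts page.url }

    # runs once with the full PageHash after the crawl finishes
    anemone.after_crawl { |pages| puts "crawled #{pages.size} pages" }
  end
end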