require 'net/http'
require 'thread'
require 'anemone/tentacle'
require 'anemone/page_hash'

module Anemone
  class Core
    # PageHash storing all Page objects encountered during the crawl
    attr_reader :pages

    #
    # Initialize the crawl with a starting *url* and optional *block*
    #
    def initialize(url, &block)
      url = URI(url) if url.is_a?(String)
      @url = url
      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []

      block.call(self) if block
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(root, &block)
      self.new(root) do |core|
        block.call(core) if block
        core.run
        core.do_after_crawl_blocks
        return core
      end
    end

    #
    # Add a block to be executed on the PageHash after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      if patterns
        patterns.each do |pattern|
          @skip_link_patterns << pattern
        end
      end
      self
    end

    #
    # Add a block to be executed on every Page as it is encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Perform the crawl
    #
    def run
      link_queue = Queue.new
      page_queue = Queue.new

      Anemone.options.threads.times do |id|
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
      end

      return if !visit_link?(@url)

      link_queue.enq(@url)

      while true do
        page = page_queue.deq

        @pages[page.url] = page

        puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose

        do_page_blocks(page)

        page.doc = nil if Anemone.options.discard_page_bodies

        page.links.each do |link|
          if visit_link?(link)
            link_queue.enq(link)
            @pages[link] = nil
          end
        end

        page.aliases.each do |aka|
          if !@pages.has_key?(aka) or @pages[aka].nil?
            @pages[aka] = page.alias_clone(aka)
          end
          @pages[aka].add_alias!(page.url)
        end

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end

          if page_queue.empty?
            @tentacles.size.times { |i| link_queue.enq(:END) }
            break
          end
        end
      end

      @tentacles.each { |t| t.join }

      self
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each { |b| b.call(@pages) }
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |blk|
        blk.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blks|
        if page.url.to_s =~ pattern
          blks.each { |blk| blk.call(page) }
        end
      end
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern. Returns
    # +false+ otherwise.
    #
    def visit_link?(link)
      !@pages.has_key?(link) and !skip_link?(link)
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.each { |p| return true if link.path =~ p }
      return false
    end

  end
end
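
# A minimal usage sketch (not part of the class itself), assuming that
# Anemone.options has been configured elsewhere with :threads, :verbose,
# and :discard_page_bodies, and that Page/PageHash behave as referenced above:
#
#   Anemone::Core.crawl("http://www.example.com") do |core|
#     core.skip_links_like(/\.pdf$/i)
#     core.on_every_page { |page| puts page.url }
#     core.after_crawl  { |pages| puts "Crawled #{pages.size} pages" }
#   end
#
# The block passed to crawl runs before the crawl starts, so it is the place
# to register skip patterns and page callbacks; run and do_after_crawl_blocks
# are then invoked automatically by Core.crawl.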