lib/anemone/core.rb in anemone-0.2.2 vs lib/anemone/core.rb in anemone-0.2.3

- old
+ new

@@ -1,43 +1,73 @@
-require 'net/http'
 require 'thread'
+require 'robots'
 require 'anemone/tentacle'
 require 'anemone/page'
 require 'anemone/page_hash'

 module Anemone
+
+  VERSION = '0.2.3';
+
+  #
+  # Convenience method to start a crawl
+  #
+  def Anemone.crawl(urls, options = {}, &block)
+    Core.crawl(urls, options, &block)
+  end
+
   class Core
     # PageHash storing all Page objects encountered during the crawl
     attr_reader :pages
-
+
+    # Hash of options for the crawl
+    attr_accessor :opts
+
+    DEFAULT_OPTS = {
+      # run 4 Tentacle threads to fetch pages
+      :threads => 4,
+      # disable verbose output
+      :verbose => false,
+      # don't throw away the page response body after scanning it for links
+      :discard_page_bodies => false,
+      # identify self as Anemone/VERSION
+      :user_agent => "Anemone/#{Anemone::VERSION}",
+      # no delay between requests
+      :delay => 0,
+      # don't obey the robots exclusion protocol
+      :obey_robots_txt => false,
+      # by default, don't limit the depth of the crawl
+      :depth_limit => false,
+      # number of times HTTP redirects will be followed
+      :redirect_limit => 5
+    }
+
     #
     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
     # and optional *block*
     #
-    def initialize(urls)
+    def initialize(urls, opts = {})
      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }

      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []
-
-      if Anemone.options.obey_robots_txt
-        @robots = Robots.new(Anemone.options.user_agent)
-      end
+      process_options opts
+
      yield self if block_given?
    end

    #
    # Convenience method to start a new crawl
    #
-    def self.crawl(root)
-      self.new(root) do |core|
+    def self.crawl(urls, opts = {})
+      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
    end
@@ -53,15 +83,11 @@
    #
    # Add one ore more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
-      if patterns
-        patterns.each do |pattern|
-          @skip_link_patterns << pattern
-        end
-      end
+      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
@@ -102,27 +128,27 @@
      return if @urls.empty?

      link_queue = Queue.new
      page_queue = Queue.new

-      Anemone.options.threads.times do
-        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+      @opts[:threads].times do
+        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end

      @urls.each{ |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq

        @pages[page.url] = page

-        puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
+        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]

        # perform the on_every_page blocks for this page
        do_page_blocks(page)

-        page.discard_doc! if Anemone.options.discard_page_bodies
+        page.discard_doc! if @opts[:discard_page_bodies]

        links_to_follow(page).each do |link|
          link_queue.enq([link, page])
          @pages[link] = nil
        end
@@ -156,11 +182,19 @@
      self
    end

    private
-
+
+    def process_options(options)
+      @opts = DEFAULT_OPTS.merge options
+
+      @opts[:threads] = 1 if @opts[:delay] > 0
+
+      @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+    end
+
    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each {|b| b.call(@pages)}
@@ -197,14 +231,14 @@
    # and is not excluded by robots.txt...
    # and is not deeper than the depth limit
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
-      allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
+      allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true

-      if from_page
-        too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
+      if from_page && @opts[:depth_limit]
+        too_deep = from_page.depth >= @opts[:depth_limit]
      else
        too_deep = false
      end

      !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
@@ -213,11 +247,10 @@
    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
-      @skip_link_patterns.each { |p| return true if link.path =~ p}
-      false
+      @skip_link_patterns.any? { |p| link.path =~ p }
    end

  end
end
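
Usage illustration (not part of the diff above): a minimal sketch of a crawl against the new per-crawl options hash that 0.2.3 introduces in place of the old global Anemone.options. The URL, the option values, and the on_every_page hook are assumptions chosen for this example, not anything specified by the diff; option keys not passed fall back to DEFAULT_OPTS.

require 'anemone'

# hypothetical crawl: example.com is a placeholder URL
Anemone.crawl("http://www.example.com/",
              :verbose => true,          # print each URL and the queue size
              :obey_robots_txt => true,  # triggers Robots.new in process_options
              :depth_limit => 3) do |anemone|
  # assumes the on_every_page hook referenced in the diff's comments
  anemone.on_every_page do |page|
    puts page.url
  end
end

Note that process_options also forces :threads down to 1 whenever a non-zero :delay is given, so a polite delayed crawl is implicitly single-threaded.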