lib/anemone/core.rb in anemone-0.1.2 vs lib/anemone/core.rb in anemone-0.2.0

- removed (anemone-0.1.2, old)
+ added (anemone-0.2.0, new)

@@ -21,10 +21,14 @@
       @on_every_page_blocks = []
       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @skip_link_patterns = []
       @after_crawl_blocks = []

+      if Anemone.options.obey_robots_txt
+        @robots = Robots.new(Anemone.options.user_agent)
+      end
+
       block.call(self) if block
     end

     #
     # Convenience method to start a new crawl
@@ -111,22 +115,22 @@
         @pages[page.url] = page

         puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose

-        #perform the on_every_page blocks for this page
+        # perform the on_every_page blocks for this page
         do_page_blocks(page)

         page.doc = nil if Anemone.options.discard_page_bodies

         links_to_follow(page).each do |link|
-          link_queue.enq(link)
+          link_queue.enq([link, page])
           @pages[link] = nil
         end

-        #create an entry in the page hash for each alias of this page,
-        #i.e. all the pages that redirected to this page
+        # create an entry in the page hash for each alias of this page,
+        # i.e. all the pages that redirected to this page
         page.aliases.each do |aka|
           if !@pages.has_key?(aka) or @pages[aka].nil?
             @pages[aka] = page.alias_clone(aka)
           end
           @pages[aka].add_alias!(page.url)
@@ -182,19 +186,29 @@
     # Based on whether or not the link has already been crawled,
     # and the block given to focus_crawl()
     #
     def links_to_follow(page)
       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.find_all { |link| visit_link?(link) }
+      links.select { |link| visit_link?(link, page) }
     end

     #
     # Returns +true+ if *link* has not been visited already,
-    # and is not excluded by a skip_link pattern. Returns
-    # +false+ otherwise.
+    # and is not excluded by a skip_link pattern...
+    # and is not excluded by robots.txt...
+    # and is not deeper than the depth limit
+    # Returns +false+ otherwise.
     #
-    def visit_link?(link)
-      !@pages.has_key?(link) and !skip_link?(link)
+    def visit_link?(link, from_page = nil)
+      allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
+
+      if from_page
+        too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
+      else
+        too_deep = false
+      end
+
+      !@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
     end

     #
     # Returns +true+ if *link* should not be visited because
     # its URL matches a skip_link pattern.
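In summary, this diff threads two new crawl controls through Core. When Anemone.options.obey_robots_txt is set, initialize builds a Robots parser for the configured user_agent and visit_link? asks it whether each candidate URL is allowed. visit_link? also now receives the page a link was found on, so it can refuse links whose source page has already reached Anemone.options.depth_limit. The link queue carries [link, page] pairs instead of bare links, presumably so the tentacles can record the referring page and its depth, and links.find_all becomes the equivalent links.select. The sketch below shows how a caller might enable the new behavior; it assumes the usual Anemone.crawl(urls, options) entry point and uses only the option names visible in the reads above, so treat it as illustrative rather than as the gem's documented API.

    require 'anemone'

    # Hypothetical usage sketch: option names are taken from the
    # Anemone.options reads in the diff; the crawl signature is assumed.
    Anemone.crawl("http://www.example.com/",
                  :obey_robots_txt => true,       # initialize builds Robots.new(user_agent)
                  :user_agent      => "Anemone",  # handed to the robots.txt parser
                  :depth_limit     => 3) do |anemone|
      anemone.on_every_page do |page|
        puts page.url
      end
    end

Note the rescue modifier on the depth check: if depth_limit is left unset, comparing an Integer against nil raises, the rescue returns false for too_deep, and the crawl remains unlimited by default.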