lib/anemone/core.rb in anemone-0.1.2 vs lib/anemone/core.rb in anemone-0.2.0

- removed (anemone-0.1.2, old)
+ added (anemone-0.2.0, new)

@@ -21,10 +21,14 @@
       @on_every_page_blocks = []
       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @skip_link_patterns = []
       @after_crawl_blocks = []

+      if Anemone.options.obey_robots_txt
+        @robots = Robots.new(Anemone.options.user_agent)
+      end
+
       block.call(self) if block
     end

     #
     # Convenience method to start a new crawl
@@ -111,22 +115,22 @@
         @pages[page.url] = page

         puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose

-        #perform the on_every_page blocks for this page
+        # perform the on_every_page blocks for this page
         do_page_blocks(page)

         page.doc = nil if Anemone.options.discard_page_bodies

         links_to_follow(page).each do |link|
-          link_queue.enq(link)
+          link_queue.enq([link, page])
           @pages[link] = nil
         end

-        #create an entry in the page hash for each alias of this page,
-        #i.e. all the pages that redirected to this page
+        # create an entry in the page hash for each alias of this page,
+        # i.e. all the pages that redirected to this page
         page.aliases.each do |aka|
           if !@pages.has_key?(aka) or @pages[aka].nil?
             @pages[aka] = page.alias_clone(aka)
           end
           @pages[aka].add_alias!(page.url)
@@ -182,19 +186,29 @@
     # Based on whether or not the link has already been crawled,
     # and the block given to focus_crawl()
     #
     def links_to_follow(page)
       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.find_all { |link| visit_link?(link) }
+      links.select { |link| visit_link?(link, page) }
     end

     #
     # Returns +true+ if *link* has not been visited already,
-    # and is not excluded by a skip_link pattern. Returns
-    # +false+ otherwise.
+    # and is not excluded by a skip_link pattern...
+    # and is not excluded by robots.txt...
+    # and is not deeper than the depth limit
+    # Returns +false+ otherwise.
     #
-    def visit_link?(link)
-      !@pages.has_key?(link) and !skip_link?(link)
+    def visit_link?(link, from_page = nil)
+      allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
+
+      if from_page
+        too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
+      else
+        too_deep = false
+      end
+
+      !@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
     end

     #
     # Returns +true+ if *link* should not be visited because
     # its URL matches a skip_link pattern.
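In summary, this diff threads two new crawl controls through Core. When Anemone.options.obey_robots_txt is set, initialize builds a Robots parser for the configured user_agent and visit_link? asks it whether each candidate URL is allowed. visit_link? also now receives the page a link was found on, so it can refuse links whose source page has already reached Anemone.options.depth_limit. The link queue carries [link, page] pairs instead of bare links, presumably so the tentacles can record the referring page and its depth, and links.find_all becomes the equivalent links.select. The sketch below shows how a caller might enable the new behavior; it assumes the usual Anemone.crawl(urls, options) entry point and uses only the option names visible in the reads above, so treat it as illustrative rather than as the gem's documented API.

    require 'anemone'

    # Hypothetical usage sketch: option names are taken from the
    # Anemone.options reads in the diff; the crawl signature is assumed.
    Anemone.crawl("http://www.example.com/",
                  :obey_robots_txt => true,       # initialize builds Robots.new(user_agent)
                  :user_agent      => "Anemone",  # handed to the robots.txt parser
                  :depth_limit     => 3) do |anemone|
      anemone.on_every_page do |page|
        puts page.url
      end
    end

Note the rescue modifier on the depth check: if depth_limit is left unset, comparing an Integer against nil raises, the rescue returns false for too_deep, and the crawl remains unlimited by default.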