lib/anemone/core.rb in anemone-0.1.2 vs lib/anemone/core.rb in anemone-0.2.0
- old
+ new
@@ -21,10 +21,14 @@
@on_every_page_blocks = []
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@skip_link_patterns = []
@after_crawl_blocks = []
+ if Anemone.options.obey_robots_txt
+ @robots = Robots.new(Anemone.options.user_agent)
+ end
+
block.call(self) if block
end
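The hunk above only builds a Robots object when obey_robots_txt is set. As a side note, here is a minimal sketch of the two robots-gem calls this diff relies on (Robots.new and #allowed?, the only methods that appear in these hunks); the user agent string and URL are made-up examples:

  require 'robots'

  robots = Robots.new("Anemone/0.2.0")           # same constructor call as in the hunk above
  robots.allowed?("http://example.com/private/") # => false if that site's robots.txt disallows it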
#
# Convenience method to start a new crawl
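As a side note, a hedged usage sketch of that crawl entry point with the two options this diff introduces; the option names come straight from the Anemone.options references below, on the assumption that 0.2.0 accepts them in the options hash passed to Anemone.crawl:

  require 'anemone'

  Anemone.crawl("http://example.com/",
                :obey_robots_txt => true,  # enables the @robots check added to visit_link?
                :depth_limit     => 3) do |anemone|
    anemone.on_every_page { |page| puts page.url }
  end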
@@ -111,22 +115,22 @@
@pages[page.url] = page
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
- #perform the on_every_page blocks for this page
+ # perform the on_every_page blocks for this page
do_page_blocks(page)
page.doc = nil if Anemone.options.discard_page_bodies
links_to_follow(page).each do |link|
- link_queue.enq(link)
+ link_queue.enq([link, page])
@pages[link] = nil
end
- #create an entry in the page hash for each alias of this page,
- #i.e. all the pages that redirected to this page
+ # create an entry in the page hash for each alias of this page,
+ # i.e. all the pages that redirected to this page
page.aliases.each do |aka|
if !@pages.has_key?(aka) or @pages[aka].nil?
@pages[aka] = page.alias_clone(aka)
end
@pages[aka].add_alias!(page.url)
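Note the queue change in this hunk: each entry is now a [link, page] pair instead of a bare link, so the consumer knows which page a link was found on. A hedged sketch of the dequeue side this implies (not the gem's actual worker loop):

  link, from_page = link_queue.deq  # unpack the [link, page] pair enqueued above
  # the newly fetched page's depth can then be derived from from_page.depth,
  # which is what the depth check added to visit_link? below relies on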
@@ -182,19 +186,29 @@
# Based on whether or not the link has already been crawled,
# and the block given to focus_crawl()
#
def links_to_follow(page)
links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
- links.find_all { |link| visit_link?(link) }
+ links.select { |link| visit_link?(link, page) }
end
#
# Returns +true+ if *link* has not been visited already,
- # and is not excluded by a skip_link pattern. Returns
- # +false+ otherwise.
+ # and is not excluded by a skip_link pattern...
+ # and is not excluded by robots.txt...
+ # and is not deeper than the depth limit
+ # Returns +false+ otherwise.
#
- def visit_link?(link)
- !@pages.has_key?(link) and !skip_link?(link)
+ def visit_link?(link, from_page = nil)
+ allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
+
+ if from_page
+ too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
+ else
+ too_deep = false
+ end
+
+ !@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
end
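A quick note on the rescue modifier in the depth check above: when no depth limit is configured, Anemone.options.depth_limit is presumably nil, the >= comparison against nil raises, and the rescue turns that into false, i.e. "no limit" means "never too deep". The idiom in isolation:

  too_deep = 2 >= nil rescue false  # Integer-vs-nil comparison raises; rescued to false
  too_deep                          # => false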
#
# Returns +true+ if *link* should not be visited because
# its URL matches a skip_link pattern.