lib/anemone/core.rb in anemone-0.0.6 vs lib/anemone/core.rb in anemone-0.1.0

- old
+ new

@@ -7,16 +7,17 @@ class Core
     # PageHash storing all Page objects encountered during the crawl
     attr_reader :pages

     #
-    # Initialize the crawl with a starting *url*, *options*, and optional *block*
+    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+    # and optional *block*
+    #
-    def initialize(url, &block)
-      url = URI(url) if url.is_a?(String)
-      @url = url
-      @url.path = "/" if @url.path.empty?
+    def initialize(urls, &block)
+      @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
+      @urls.each{ |url| url.path = '/' if url.path.empty? }
+
       @tentacles = []
       @pages = PageHash.new
       @on_every_page_blocks = []
       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @skip_link_patterns = []
@@ -79,42 +80,53 @@
       end
       self
     end

     #
+    # Specify a block which will select which links to follow on each page.
+    # The block should return an Array of URI objects.
+    #
+    def focus_crawl(&block)
+      @focus_crawl_block = block
+      self
+    end
+
+    #
     # Perform the crawl
     #
     def run
+      @urls.delete_if { |url| !visit_link?(url) }
+      return if @urls.empty?
+
       link_queue = Queue.new
       page_queue = Queue.new

       Anemone.options.threads.times do |id|
         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
       end

-      return if !visit_link?(@url)
-
-      link_queue.enq(@url)
+      @urls.each{ |url| link_queue.enq(url) }

-      while true do
+      loop do
         page = page_queue.deq

         @pages[page.url] = page

         puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose

+        #perform the on_every_page blocks for this page
         do_page_blocks(page)
         page.doc = nil if Anemone.options.discard_page_bodies

-        page.links.each do |link|
-          if visit_link?(link)
-            link_queue.enq(link)
-            @pages[link] = nil
-          end
+        links_to_follow(page).each do |link|
+          link_queue.enq(link)
+          @pages[link] = nil
         end

+        #create an entry in the page hash for each alias of this page,
+        #i.e. all the pages that redirected to this page
        page.aliases.each do |aka|
          if !@pages.has_key?(aka) or @pages[aka].nil?
            @pages[aka] = page.alias_clone(aka)
          end
          @pages[aka].add_alias!(page.url)
@@ -162,9 +174,19 @@
         if page.url.to_s =~ pattern
           blks.each { |blk| blk.call(page) }
         end
       end
     end
+
+    #
+    # Return an Array of links to follow from the given page.
+    # Based on whether or not the link has already been crawled,
+    # and the block given to focus_crawl()
+    #
+    def links_to_follow(page)
+      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+      links.find_all { |link| visit_link?(link) }
+    end
     #
     # Returns +true+ if *link* has not been visited already,
     # and is not excluded by a skip_link pattern. Returns
     # +false+ otherwise.
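A minimal usage sketch of the 0.1.0 changes above. It assumes the gem's usual Anemone.crawl entry point (defined outside this file) forwards its first argument to Core#initialize; the start URLs, the /docs path filter, and the block bodies are placeholders for illustration only.

  require 'anemone'

  # Core#initialize now accepts a single URL or an Array of URLs.
  # Anemone.crawl is assumed to hand this Array through to Core.
  Anemone.crawl(['http://example.com/', 'http://example.org/docs/']) do |anemone|
    # focus_crawl stores this block in @focus_crawl_block; it must return
    # an Array of URI objects. links_to_follow then filters that Array
    # through visit_link?, dropping already-seen or skipped URLs.
    anemone.focus_crawl do |page|
      page.links.select { |uri| uri.path =~ %r{^/docs} }
    end

    # on_every_page blocks are run by do_page_blocks inside the crawl loop.
    anemone.on_every_page do |page|
      puts page.url
    end
  end

If no focus_crawl block is given, links_to_follow falls back to page.links, so an existing crawl behaves as before apart from now accepting multiple start URLs.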