lib/anemone/core.rb in spk-anemone-0.2.4 vs lib/anemone/core.rb in spk-anemone-0.3.0

- old
+ new

@@ -1,28 +1,29 @@
 require 'thread'
 require 'robots'
 require 'anemone/tentacle'
 require 'anemone/page'
-require 'anemone/page_hash'
+require 'anemone/page_store'
+require 'anemone/storage'

 module Anemone

-  VERSION = '0.2.4';
+  VERSION = '0.3.0';

   #
   # Convenience method to start a crawl
   #
   def Anemone.crawl(urls, options = {}, &block)
     Core.crawl(urls, options, &block)
   end

   class Core
-    # PageHash storing all Page objects encountered during the crawl
-    attr_reader :pages
+    # PageStore storing all Page objects encountered during the crawl
+    attr_reader :pages

     # Hash of options for the crawl
-    attr_accessor :opts
+    attr_reader :opts

     DEFAULT_OPTS = {
       # run 4 Tentacle threads to fetch pages
       :threads => 4,
       # disable verbose output
@@ -37,33 +38,37 @@
       :obey_robots_txt => false,
       # by default, don't limit the depth of the crawl
       :depth_limit => false,
       # number of times HTTP redirects will be followed
       :redirect_limit => 5,
+      # storage engine defaults to Hash in +process_options+ if none specified
+      :storage => nil,
       # Authentication
       :authorization => nil,
     }

+    # Create setter methods for all options to be called from the crawl block
+    DEFAULT_OPTS.keys.each do |key|
+      define_method "#{key}=" do |*args|
+        @opts[key.to_sym] = *args
+      end
+    end
+
     #
     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
     # and optional *block*
     #
     def initialize(urls, opts = {})
-      process_options opts
-
       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
-      @urls.each{ |url|
-        url.path = '/' if url.path.empty?
-        authorization(url) if url.user
-      }
+      @urls.each{ |url| url.path = '/' if url.path.empty? }

       @tentacles = []
-      @pages = PageHash.new
       @on_every_page_blocks = []
       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @skip_link_patterns = []
       @after_crawl_blocks = []
+      @opts = opts

       yield self if block_given?
     end

     #
@@ -75,11 +80,11 @@
         core.run
       end
     end

     #
-    # Add a block to be executed on the PageHash after the crawl
+    # Add a block to be executed on the PageStore after the crawl
     # is finished
     #
     def after_crawl(&block)
       @after_crawl_blocks << block
       self
@@ -127,103 +132,90 @@
     #
     # Perform the crawl
     #
     def run
+      process_options
+
       @urls.delete_if { |url| !visit_link?(url) }
       return if @urls.empty?

       link_queue = Queue.new
       page_queue = Queue.new

       @opts[:threads].times do
         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
       end

-      @urls.each{ |url| link_queue.enq(url) }
+      @urls.each{ |url|
+        link_queue.enq(url)
+        authorization(url) if url.user
+      }

       loop do
         page = page_queue.deq
-
-        @pages[page.url] = page
-
+        @pages.touch_key page.url
         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
-        # perform the on_every_page blocks for this page
-        do_page_blocks(page)
-
+        do_page_blocks page
         page.discard_doc! if @opts[:discard_page_bodies]

-        links_to_follow(page).each do |link|
-          link_queue.enq([link, page])
-          @pages[link] = nil
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
         end
+        @pages.touch_keys links

-        # create an entry in the page hash for each alias of this page,
-        # i.e. all the pages that redirected to this page
-        page.aliases.each do |aka|
-          if !@pages.has_key?(aka) or @pages[aka].nil?
-            @pages[aka] = page.alias_clone(aka)
-          end
-          @pages[aka].add_alias!(page.url)
-        end
+        @pages[page.url] = page

         # if we are done with the crawl, tell the threads to end
         if link_queue.empty? and page_queue.empty?
           until link_queue.num_waiting == @tentacles.size
             Thread.pass
           end
-
           if page_queue.empty?
-            @tentacles.size.times { link_queue.enq(:END)}
+            @tentacles.size.times { link_queue << :END }
             break
           end
         end
-
       end

       @tentacles.each { |t| t.join }
-
-      do_after_crawl_blocks()
-
+      do_after_crawl_blocks
       self
     end

     private

-    def process_options(options)
-      @opts = DEFAULT_OPTS.merge options
-
-      authorization(@opts[:authorization])
-
+    def process_options
+      @opts = DEFAULT_OPTS.merge @opts
+      authorization(@opts[:authorization]) if @opts[:authorization]
       @opts[:threads] = 1 if @opts[:delay] > 0
-
+      @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
     end

     # Generate Authorization string only if not already set
     def authorization(auth=nil)
-      return if @opts[:authorization] =~ /^Basic .*/
       require 'base64'
       if auth.is_a?(String) && auth.include?(':')
-        @opts[:authorization] = "Basic #{Base64.b64encode(auth)}"
+        self.authorization = "Basic #{Base64.b64encode(auth)}"
       elsif auth.is_a?(Array)
         user = auth.first
         password = auth.last
-        @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+        self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
       elsif auth.is_a?(URI)
         user = auth.user
         password = auth.password
-        @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+        self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
       end
     end

     #
     # Execute the after_crawl blocks
     #
     def do_after_crawl_blocks
-      @after_crawl_blocks.each {|b| b.call(@pages)}
+      @after_crawl_blocks.each { |b| b.call(@pages) }
     end

     #
     # Execute the on_every_page blocks for *page*
     #
@@ -231,23 +223,21 @@
       @on_every_page_blocks.each do |blk|
         blk.call(page)
       end

       @on_pages_like_blocks.each do |pattern, blks|
-        if page.url.to_s =~ pattern
-          blks.each { |blk| blk.call(page) }
-        end
+        blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
       end
     end

     #
     # Return an Array of links to follow from the given page.
     # Based on whether or not the link has already been crawled,
     # and the block given to focus_crawl()
     #
     def links_to_follow(page)
       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.select { |link| visit_link?(link, page) }
+      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
     end

     #
     # Returns +true+ if *link* has not been visited already,
     # and is not excluded by a skip_link pattern...
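
For reference, a minimal usage sketch of the 0.3.0 interface introduced above: the per-option setter methods generated from DEFAULT_OPTS and the new :storage option backing the PageStore. The crawl URL and the option values are illustrative only; Anemone.crawl, the generated setters, Anemone::Storage.Hash (the default engine chosen by process_options), and page.url come from the diff, and on_every_page is assumed unchanged from 0.2.4.

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # Options are now set through the generated setter methods rather than
      # by assigning to the options hash (attr_accessor :opts became attr_reader).
      anemone.threads     = 2
      anemone.verbose     = true
      anemone.depth_limit = 3

      # Choose the storage engine for the PageStore; Anemone::Storage.Hash is
      # what process_options falls back to when :storage is nil.
      anemone.storage = Anemone::Storage.Hash

      anemone.on_every_page do |page|
        puts page.url
      end
    end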