lib/anemone/core.rb in spk-anemone-0.3.1 vs lib/anemone/core.rb in spk-anemone-0.4.0

- old
+ new

@@ -5,11 +5,11 @@
 require 'anemone/page_store'
 require 'anemone/storage'
 
 module Anemone
 
-  VERSION = '0.3.1';
+  VERSION = '0.4.0';
 
   #
   # Convenience method to start a crawl
   #
   def Anemone.crawl(urls, options = {}, &block)
@@ -40,18 +40,22 @@
       :depth_limit => false,
       # number of times HTTP redirects will be followed
       :redirect_limit => 5,
       # storage engine defaults to Hash in +process_options+ if none specified
       :storage => nil,
+      # Hash of cookie name => value to send with HTTP requests
+      :cookies => nil,
+      # accept cookies from the server and send them back?
+      :accept_cookies => false,
       # Authentication
       :authorization => nil,
     }
 
     # Create setter methods for all options to be called from the crawl block
     DEFAULT_OPTS.keys.each do |key|
-      define_method "#{key}=" do |*args|
-        @opts[key.to_sym] = *args
+      define_method "#{key}=" do |value|
+        @opts[key.to_sym] = value
       end
     end
 
     #
     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
@@ -176,11 +180,11 @@
             break
           end
         end
       end
 
-      @tentacles.each { |t| t.join }
+      @tentacles.each { |thread| thread.join }
       do_after_crawl_blocks
       self
     end
 
     private
@@ -189,12 +193,24 @@
       @opts = DEFAULT_OPTS.merge @opts
       authorization(@opts[:authorization]) if @opts[:authorization]
       @opts[:threads] = 1 if @opts[:delay] > 0
       @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+
+      freeze_options
     end
 
+    #
+    # Freeze the opts Hash so that no options can be modified
+    # once the crawl begins
+    #
+    def freeze_options
+      @opts.freeze
+      @opts.each_key { |key| @opts[key].freeze }
+      @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
+    end
+
     # Generate Authorization string and set authorization opts
     def authorization(auth=nil)
       require 'base64'
       if auth.is_a?(String) && auth.include?(':')
         self.authorization = "Basic #{Base64.b64encode(auth)}"
@@ -211,23 +227,23 @@
     #
     # Execute the after_crawl blocks
     #
     def do_after_crawl_blocks
-      @after_crawl_blocks.each { |b| b.call(@pages) }
+      @after_crawl_blocks.each { |block| block.call(@pages) }
     end
 
     #
     # Execute the on_every_page blocks for *page*
     #
     def do_page_blocks(page)
-      @on_every_page_blocks.each do |blk|
-        blk.call(page)
+      @on_every_page_blocks.each do |block|
+        block.call(page)
       end
 
-      @on_pages_like_blocks.each do |pattern, blks|
-        blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
+      @on_pages_like_blocks.each do |pattern, blocks|
+        blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
       end
     end
 
     #
     # Return an Array of links to follow from the given page.
@@ -261,10 +277,10 @@
     #
     # Returns +true+ if *link* should not be visited because
     # its URL matches a skip_link pattern.
    #
     def skip_link?(link)
-      @skip_link_patterns.any? { |p| link.path =~ p }
+      @skip_link_patterns.any? { |pattern| link.path =~ pattern }
     end
 
   end
 end
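
The headline addition in 0.4.0 is cookie handling: the new :cookies and :accept_cookies entries in DEFAULT_OPTS. A minimal usage sketch based on the Anemone.crawl signature shown above; the URL and the cookie name/value are illustrative only, not taken from the diff:

    require 'anemone'

    # Send a fixed cookie with every request, and keep any cookies the
    # server sets (both options default to off, per DEFAULT_OPTS).
    Anemone.crawl("http://www.example.com/",
                  :cookies        => { 'session_id' => 'abc123' },
                  :accept_cookies => true) do |anemone|
      anemone.on_every_page { |page| puts page.url }
    end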
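
Two of the smaller diffs change behaviour, not just naming. The generated option setters now take a single |value| argument: under Ruby 1.9 the old splat form @opts[key.to_sym] = *args assigns the one-element array itself (e.g. [4] instead of 4), so the new form keeps option values scalar on both 1.8 and 1.9. And process_options now ends with freeze_options, so options are writable only until the crawl begins; the inline rescue nil covers the default case where @opts[:cookies] is nil. A behavioural sketch, assuming the usual Anemone flow in which the crawl block runs before Core#run:

    Anemone.crawl("http://www.example.com/") do |anemone|
      # Fine: run (and therefore process_options/freeze_options) has not
      # executed yet, so the generated setter can still write to @opts.
      anemone.depth_limit = 2

      anemone.on_every_page do |page|
        # This block runs mid-crawl, after @opts has been frozen, so a
        # setter call here (e.g. anemone.redirect_limit = 1) raises
        # "can't modify frozen Hash" (TypeError on 1.8, RuntimeError on 1.9).
      end
    end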