lib/anemone/core.rb in anemone-0.3.2 vs lib/anemone/core.rb in anemone-0.4.0

- old
+ new

@@ -5,11 +5,11 @@ require 'anemone/page_store' require 'anemone/storage' module Anemone - VERSION = '0.3.2'; + VERSION = '0.4.0'; # # Convenience method to start a crawl # def Anemone.crawl(urls, options = {}, &block) @@ -39,17 +39,21 @@ # by default, don't limit the depth of the crawl :depth_limit => false, # number of times HTTP redirects will be followed :redirect_limit => 5, # storage engine defaults to Hash in +process_options+ if none specified - :storage => nil + :storage => nil, + # Hash of cookie name => value to send with HTTP requests + :cookies => nil, + # accept cookies from the server and send them back? + :accept_cookies => false } # Create setter methods for all options to be called from the crawl block DEFAULT_OPTS.keys.each do |key| - define_method "#{key}=" do |*args| - @opts[key.to_sym] = *args + define_method "#{key}=" do |value| + @opts[key.to_sym] = value end end # # Initialize the crawl with starting *urls* (single URL or Array of URLs) @@ -171,11 +175,11 @@ break end end end - @tentacles.each { |t| t.join } + @tentacles.each { |thread| thread.join } do_after_crawl_blocks self end private @@ -183,29 +187,41 @@ def process_options @opts = DEFAULT_OPTS.merge @opts @opts[:threads] = 1 if @opts[:delay] > 0 @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash) @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt] + + freeze_options end # + # Freeze the opts Hash so that no options can be modified + # once the crawl begins + # + def freeze_options + @opts.freeze + @opts.each_key { |key| @opts[key].freeze } + @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil + end + + # # Execute the after_crawl blocks # def do_after_crawl_blocks - @after_crawl_blocks.each { |b| b.call(@pages) } + @after_crawl_blocks.each { |block| block.call(@pages) } end # # Execute the on_every_page blocks for *page* # def do_page_blocks(page) - @on_every_page_blocks.each do |blk| - blk.call(page) + @on_every_page_blocks.each do |block| + block.call(page) end - @on_pages_like_blocks.each do |pattern, blks| - blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern + @on_pages_like_blocks.each do |pattern, blocks| + blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern end end # # Return an Array of links to follow from the given page. @@ -239,10 +255,10 @@ # # Returns +true+ if *link* should not be visited because # its URL matches a skip_link pattern. # def skip_link?(link) - @skip_link_patterns.any? { |p| link.path =~ p } + @skip_link_patterns.any? { |pattern| link.path =~ pattern } end end end