lib/anemone/core.rb in anemone-0.3.2 vs lib/anemone/core.rb in anemone-0.4.0
- old
+ new
@@ -5,11 +5,11 @@
require 'anemone/page_store'
require 'anemone/storage'
module Anemone
- VERSION = '0.3.2';
+ VERSION = '0.4.0';
#
# Convenience method to start a crawl
#
def Anemone.crawl(urls, options = {}, &block)
@@ -39,17 +39,21 @@
# by default, don't limit the depth of the crawl
:depth_limit => false,
# number of times HTTP redirects will be followed
:redirect_limit => 5,
# storage engine defaults to Hash in +process_options+ if none specified
- :storage => nil
+ :storage => nil,
+ # Hash of cookie name => value to send with HTTP requests
+ :cookies => nil,
+ # accept cookies from the server and send them back?
+ :accept_cookies => false
}
# Create setter methods for all options to be called from the crawl block
DEFAULT_OPTS.keys.each do |key|  # one writer method is generated per key of DEFAULT_OPTS
- define_method "#{key}=" do |*args|
- @opts[key.to_sym] = *args
+ define_method "#{key}=" do |value|  # new form takes exactly one value; the old form took a splat
+ @opts[key.to_sym] = value  # NOTE(review): freeze_options freezes @opts, so this write presumably raises once process_options has run - confirm
end
end
#
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
@@ -171,11 +175,11 @@
break
end
end
end
- @tentacles.each { |t| t.join }
+ @tentacles.each { |thread| thread.join }
do_after_crawl_blocks
self
end
private
@@ -183,29 +187,41 @@
def process_options
@opts = DEFAULT_OPTS.merge @opts  # values already in @opts win over the defaults
@opts[:threads] = 1 if @opts[:delay] > 0  # any positive delay forces a single thread
@pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)  # falls back to Storage.Hash when :storage is nil/false
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]  # @robots is only assigned when :obey_robots_txt is truthy
+
+ freeze_options  # has to come last: the lines above still write to @opts
end
#
+ # Freeze the opts Hash so that no options can be modified
+ # once the crawl begins
+ #
+ def freeze_options
+ @opts.freeze  # the Hash itself: keys can no longer be added, removed or reassigned
+ @opts.each_key { |key| @opts[key].freeze }  # each option value is frozen as well
+ @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil  # NOTE(review): the rescue modifier hides the NoMethodError raised when :cookies is nil (its default), and any other StandardError too
+ end
+
+ #
# Execute the after_crawl blocks
#
def do_after_crawl_blocks
- @after_crawl_blocks.each { |b| b.call(@pages) }
+ @after_crawl_blocks.each { |block| block.call(@pages) }  # each stored block is called with @pages as its only argument
end
#
# Execute the on_every_page blocks for *page*, then the on_pages_like
# blocks whose pattern matches the page's URL
#
def do_page_blocks(page)
- @on_every_page_blocks.each do |blk|
- blk.call(page)
+ @on_every_page_blocks.each do |block|  # unconditional: every stored block is called for every page
+ block.call(page)
end
- @on_pages_like_blocks.each do |pattern, blks|
- blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
+ @on_pages_like_blocks.each do |pattern, blocks|  # each entry pairs a pattern with the blocks registered for it
+ blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern  # pattern is matched against the URL converted to a String
end
end
#
# Return an Array of links to follow from the given page.
@@ -239,10 +255,10 @@
#
# Returns +true+ if *link* should not be visited because
# its path matches a skip_link pattern.
#
def skip_link?(link)
- @skip_link_patterns.any? { |p| link.path =~ p }
+ @skip_link_patterns.any? { |pattern| link.path =~ pattern }  # only link.path is tested, not the whole link; true as soon as one pattern matches
end
end
end