lib/anemone/core.rb in spk-anemone-0.3.1 vs lib/anemone/core.rb in spk-anemone-0.4.0
- old
+ new
@@ -5,11 +5,11 @@
require 'anemone/page_store'
require 'anemone/storage'
module Anemone
- VERSION = '0.3.1';
+ VERSION = '0.4.0';
#
# Convenience method to start a crawl
#
def Anemone.crawl(urls, options = {}, &block)
@@ -40,18 +40,22 @@
:depth_limit => false,
# number of times HTTP redirects will be followed
:redirect_limit => 5,
# storage engine defaults to Hash in +process_options+ if none specified
:storage => nil,
+ # Hash of cookie name => value to send with HTTP requests
+ :cookies => nil,
+ # accept cookies from the server and send them back?
+ :accept_cookies => false,
# Authentication
:authorization => nil,
}
# Create setter methods ("<option>=") for all DEFAULT_OPTS keys, to be
# called from the crawl block; each setter stores what it is given in
# @opts under that option's key
DEFAULT_OPTS.keys.each do |key|
- define_method "#{key}=" do |*args|
- @opts[key.to_sym] = *args
+ define_method "#{key}=" do |value|
+ @opts[key.to_sym] = value
end
end
#
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
@@ -176,11 +180,11 @@
break
end
end
end
- @tentacles.each { |t| t.join }
+ @tentacles.each { |thread| thread.join }
do_after_crawl_blocks
self
end
private
@@ -189,12 +193,24 @@
@opts = DEFAULT_OPTS.merge @opts
authorization(@opts[:authorization]) if @opts[:authorization]
@opts[:threads] = 1 if @opts[:delay] > 0
@pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+
+ freeze_options
end
+ #
+ # Freeze the opts Hash, each option value, and each :cookies value
+ # (shallow freezes) so that no options can be modified once the crawl begins
+ #
+ def freeze_options
+ @opts.freeze
+ @opts.each_key { |key| @opts[key].freeze }
+ @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil # NOTE(review): :cookies defaults to nil, so the rescue covers that case, but it also hides any other StandardError
+ end
+
# Generate Authorization string and set authorization opts
def authorization(auth=nil)
require 'base64'
if auth.is_a?(String) && auth.include?(':')
self.authorization = "Basic #{Base64.b64encode(auth)}"
@@ -211,23 +227,23 @@
#
# Execute the after_crawl blocks, calling each one with the
# page store (@pages)
#
def do_after_crawl_blocks
- @after_crawl_blocks.each { |b| b.call(@pages) }
+ @after_crawl_blocks.each { |block| block.call(@pages) }
end
#
# Execute the page callbacks for *page*: every on_every_page block is
# called with the page, then every on_pages_like block whose pattern
# matches page.url.to_s is called with the page
#
def do_page_blocks(page)
- @on_every_page_blocks.each do |blk|
- blk.call(page)
+ @on_every_page_blocks.each do |block|
+ block.call(page)
end
- @on_pages_like_blocks.each do |pattern, blks|
- blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
+ @on_pages_like_blocks.each do |pattern, blocks|
+ blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
end
end
#
# Return an Array of links to follow from the given page.
@@ -261,10 +277,10 @@
#
# Returns +true+ if *link* should not be visited because
# its path (link.path, not the full URL) matches any skip_link pattern.
#
def skip_link?(link)
- @skip_link_patterns.any? { |p| link.path =~ p }
+ @skip_link_patterns.any? { |pattern| link.path =~ pattern }
end
end
end