lib/anemone/core.rb in spk-anemone-0.2.4 vs lib/anemone/core.rb in spk-anemone-0.3.0
- line from the old version (0.2.4)
+ line from the new version (0.3.0)
@@ -1,28 +1,29 @@
require 'thread'
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
-require 'anemone/page_hash'
+require 'anemone/page_store'
+require 'anemone/storage'
module Anemone
- VERSION = '0.2.4';
+ VERSION = '0.3.0';
#
# Convenience method to start a crawl
#
def Anemone.crawl(urls, options = {}, &block)
Core.crawl(urls, options, &block)
end
class Core
- # PageHash storing all Page objects encountered during the crawl
- attr_reader :pages
+ # PageStore storing all Page objects encountered during the crawl
+ attr_reader :pages
# Hash of options for the crawl
- attr_accessor :opts
+ attr_reader :opts
DEFAULT_OPTS = {
# run 4 Tentacle threads to fetch pages
:threads => 4,
# disable verbose output
@@ -37,33 +38,37 @@
:obey_robots_txt => false,
# by default, don't limit the depth of the crawl
:depth_limit => false,
# number of times HTTP redirects will be followed
:redirect_limit => 5,
+ # storage engine defaults to Hash in +process_options+ if none specified
+ :storage => nil,
# Authentication
:authorization => nil,
}
+ # Create setter methods for all options to be called from the crawl block
+ DEFAULT_OPTS.keys.each do |key|
+ define_method "#{key}=" do |*args|
+ @opts[key.to_sym] = *args
+ end
+ end
+
#
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
# and optional *block*
#
def initialize(urls, opts = {})
- process_options opts
-
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
- @urls.each{ |url|
- url.path = '/' if url.path.empty?
- authorization(url) if url.user
- }
+ @urls.each{ |url| url.path = '/' if url.path.empty? }
@tentacles = []
- @pages = PageHash.new
@on_every_page_blocks = []
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@skip_link_patterns = []
@after_crawl_blocks = []
+ @opts = opts
yield self if block_given?
end
#
@@ -75,11 +80,11 @@
core.run
end
end
#
- # Add a block to be executed on the PageHash after the crawl
+ # Add a block to be executed on the PageStore after the crawl
# is finished
#
def after_crawl(&block)
@after_crawl_blocks << block
self
@@ -127,103 +132,90 @@
#
# Perform the crawl
#
def run
+ process_options
+
@urls.delete_if { |url| !visit_link?(url) }
return if @urls.empty?
link_queue = Queue.new
page_queue = Queue.new
@opts[:threads].times do
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
end
- @urls.each{ |url| link_queue.enq(url) }
+ @urls.each{ |url|
+ link_queue.enq(url)
+ authorization(url) if url.user
+ }
loop do
page = page_queue.deq
-
- @pages[page.url] = page
-
+ @pages.touch_key page.url
puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
- # perform the on_every_page blocks for this page
- do_page_blocks(page)
-
+ do_page_blocks page
page.discard_doc! if @opts[:discard_page_bodies]
- links_to_follow(page).each do |link|
- link_queue.enq([link, page])
- @pages[link] = nil
+ links = links_to_follow page
+ links.each do |link|
+ link_queue << [link, page.url.dup, page.depth + 1]
end
+ @pages.touch_keys links
- # create an entry in the page hash for each alias of this page,
- # i.e. all the pages that redirected to this page
- page.aliases.each do |aka|
- if !@pages.has_key?(aka) or @pages[aka].nil?
- @pages[aka] = page.alias_clone(aka)
- end
- @pages[aka].add_alias!(page.url)
- end
+ @pages[page.url] = page
# if we are done with the crawl, tell the threads to end
if link_queue.empty? and page_queue.empty?
until link_queue.num_waiting == @tentacles.size
Thread.pass
end
-
if page_queue.empty?
- @tentacles.size.times { link_queue.enq(:END)}
+ @tentacles.size.times { link_queue << :END }
break
end
end
-
end
@tentacles.each { |t| t.join }
-
- do_after_crawl_blocks()
-
+ do_after_crawl_blocks
self
end
private
- def process_options(options)
- @opts = DEFAULT_OPTS.merge options
-
- authorization(@opts[:authorization])
-
+ def process_options
+ @opts = DEFAULT_OPTS.merge @opts
+ authorization(@opts[:authorization]) if @opts[:authorization]
@opts[:threads] = 1 if @opts[:delay] > 0
-
+ @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
end
# Generate Authorization string only if not already set
def authorization(auth=nil)
- return if @opts[:authorization] =~ /^Basic .*/
require 'base64'
if auth.is_a?(String) && auth.include?(':')
- @opts[:authorization] = "Basic #{Base64.b64encode(auth)}"
+ self.authorization = "Basic #{Base64.b64encode(auth)}"
elsif auth.is_a?(Array)
user = auth.first
password = auth.last
- @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+ self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
elsif auth.is_a?(URI)
user = auth.user
password = auth.password
- @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+ self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
end
end
#
# Execute the after_crawl blocks
#
def do_after_crawl_blocks
- @after_crawl_blocks.each {|b| b.call(@pages)}
+ @after_crawl_blocks.each { |b| b.call(@pages) }
end
#
# Execute the on_every_page blocks for *page*
#
@@ -231,23 +223,21 @@
@on_every_page_blocks.each do |blk|
blk.call(page)
end
@on_pages_like_blocks.each do |pattern, blks|
- if page.url.to_s =~ pattern
- blks.each { |blk| blk.call(page) }
- end
+ blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
end
end
#
# Return an Array of links to follow from the given page.
# Based on whether or not the link has already been crawled,
# and the block given to focus_crawl()
#
def links_to_follow(page)
links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
- links.select { |link| visit_link?(link, page) }
+ links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
end
#
# Returns +true+ if *link* has not been visited already,
# and is not excluded by a skip_link pattern...
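
For reference, a minimal usage sketch of the options API introduced in 0.3.0, assuming the setters generated from DEFAULT_OPTS.keys above and Core's on_every_page helper, which lies outside this excerpt; Anemone::Storage.Hash is the default backend chosen by process_options when :storage is nil.

  require 'anemone'

  Anemone.crawl("http://example.com/") do |anemone|
    # Options can now be assigned inside the crawl block through the
    # generated setters; they are merged over DEFAULT_OPTS when run
    # calls process_options.
    anemone.threads       = 2
    anemone.depth_limit   = 3
    anemone.storage       = Anemone::Storage.Hash  # explicit in-memory store (the default)
    anemone.authorization = ['user', 'secret']     # converted to a Basic auth header by process_options

    anemone.on_every_page do |page|
      puts page.url
    end
  end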