lib/anemone/core.rb in anemone-0.2.2 vs lib/anemone/core.rb in anemone-0.2.3
- lines removed (0.2.2, old)
+ lines added (0.2.3, new)
@@ -1,43 +1,73 @@
-require 'net/http'
require 'thread'
+require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/page_hash'
module Anemone
+
+ VERSION = '0.2.3';
+
+ #
+ # Convenience method to start a crawl
+ #
+ def Anemone.crawl(urls, options = {}, &block)
+ Core.crawl(urls, options, &block)
+ end
+
class Core
# PageHash storing all Page objects encountered during the crawl
attr_reader :pages
-
+
+ # Hash of options for the crawl
+ attr_accessor :opts
+
+ DEFAULT_OPTS = {
+ # run 4 Tentacle threads to fetch pages
+ :threads => 4,
+ # disable verbose output
+ :verbose => false,
+ # don't throw away the page response body after scanning it for links
+ :discard_page_bodies => false,
+ # identify self as Anemone/VERSION
+ :user_agent => "Anemone/#{Anemone::VERSION}",
+ # no delay between requests
+ :delay => 0,
+ # don't obey the robots exclusion protocol
+ :obey_robots_txt => false,
+ # by default, don't limit the depth of the crawl
+ :depth_limit => false,
+ # number of times HTTP redirects will be followed
+ :redirect_limit => 5
+ }
+
#
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
# and optional *block*
#
- def initialize(urls)
+ def initialize(urls, opts = {})
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
@urls.each{ |url| url.path = '/' if url.path.empty? }
@tentacles = []
@pages = PageHash.new
@on_every_page_blocks = []
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@skip_link_patterns = []
@after_crawl_blocks = []
-
- if Anemone.options.obey_robots_txt
- @robots = Robots.new(Anemone.options.user_agent)
- end
+ process_options opts
+
yield self if block_given?
end
#
# Convenience method to start a new crawl
#
- def self.crawl(root)
- self.new(root) do |core|
+ def self.crawl(urls, opts = {})
+ self.new(urls, opts) do |core|
yield core if block_given?
core.run
end
end
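With 0.2.3, crawl configuration moves from the old global Anemone.options into a per-crawl options Hash, and Anemone.crawl(urls, options, &block) becomes the convenience entry point. A minimal usage sketch of the new API (the URL is a placeholder; on_every_page is the block-registration method declared further down in this file):

    require 'anemone'

    # Any option not passed here falls back to DEFAULT_OPTS above.
    Anemone.crawl("http://www.example.com/",
                  :threads     => 2,     # fewer Tentacle threads than the default 4
                  :depth_limit => 3,     # stop 3 link-hops from the start pages
                  :verbose     => true) do |anemone|
      anemone.on_every_page { |page| puts page.url }
    end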
@@ -53,15 +83,11 @@
#
# Add one or more Regex patterns for URLs which should not be
# followed
#
def skip_links_like(*patterns)
- if patterns
- patterns.each do |pattern|
- @skip_link_patterns << pattern
- end
- end
+ @skip_link_patterns.concat [patterns].flatten.compact
self
end
#
# Add a block to be executed on every Page as they are encountered
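skip_links_like now concatenates [patterns].flatten.compact onto @skip_link_patterns, so it accepts individual Regexps, an Array of them, or a mix, silently drops nils, and still returns self for chaining. A small sketch, assuming a crawl block like the one above (patterns are illustrative):

    Anemone.crawl("http://www.example.com/") do |anemone|
      # a bare pattern and an Array of patterns are both accepted now
      anemone.skip_links_like(/\.pdf$/i, [%r{/login}, %r{/logout}])
    end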
@@ -102,27 +128,27 @@
return if @urls.empty?
link_queue = Queue.new
page_queue = Queue.new
- Anemone.options.threads.times do
- @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+ @opts[:threads].times do
+ @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
end
@urls.each{ |url| link_queue.enq(url) }
loop do
page = page_queue.deq
@pages[page.url] = page
- puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
+ puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
# perform the on_every_page blocks for this page
do_page_blocks(page)
- page.discard_doc! if Anemone.options.discard_page_bodies
+ page.discard_doc! if @opts[:discard_page_bodies]
links_to_follow(page).each do |link|
link_queue.enq([link, page])
@pages[link] = nil
end
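The run loop is a single consumer fed by the Tentacle worker threads: Core enqueues starting URLs on link_queue, each Tentacle dequeues a link, fetches it, and enqueues a Page on page_queue, and the loop above records the page, runs the page blocks, and enqueues any new links it finds. A stripped-down sketch of that producer/consumer shape using the same Queue class (the string result and the :END sentinel are stand-ins, not Anemone code):

    require 'thread'

    link_queue = Queue.new
    page_queue = Queue.new

    # workers play the Tentacle role: pull a link, "fetch" it, push back a result
    workers = Array.new(4) do
      Thread.new do
        while (link = link_queue.deq) != :END
          page_queue.enq("fetched #{link}")   # stand-in for a fetched Page object
        end
      end
    end

    %w[/a /b /c].each { |path| link_queue.enq(path) }

    3.times { puts page_queue.deq }    # the consumer side, like the loop above
    4.times { link_queue.enq(:END) }   # tell every worker to exit
    workers.each(&:join)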
@@ -156,11 +182,19 @@
self
end
private
-
+
+ def process_options(options)
+ @opts = DEFAULT_OPTS.merge options
+
+ @opts[:threads] = 1 if @opts[:delay] > 0
+
+ @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+ end
+
#
# Execute the after_crawl blocks
#
def do_after_crawl_blocks
@after_crawl_blocks.each {|b| b.call(@pages)}
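process_options layers the caller's Hash over DEFAULT_OPTS with Hash#merge, then forces a single thread whenever a delay is set, so delayed requests stay strictly sequential; the Robots parser is only built when :obey_robots_txt is true. A quick illustration of the merge-then-adjust behaviour with plain Hashes (nothing Anemone-specific):

    defaults = { :threads => 4, :delay => 0, :verbose => false }

    opts = defaults.merge(:delay => 2, :verbose => true)
    opts[:threads] = 1 if opts[:delay] > 0

    opts   # => {:threads=>1, :delay=>2, :verbose=>true}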
@@ -197,14 +231,14 @@
# and is not excluded by robots.txt...
# and is not deeper than the depth limit
# Returns +false+ otherwise.
#
def visit_link?(link, from_page = nil)
- allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
+ allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
- if from_page
- too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
+ if from_page && @opts[:depth_limit]
+ too_deep = from_page.depth >= @opts[:depth_limit]
else
too_deep = false
end
!@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
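The depth check no longer hides errors behind an inline rescue. In 0.2.2, when no depth limit was configured, from_page.depth >= Anemone.options.depth_limit compared an Integer against nil/false, raised, and was silently rescued to false; 0.2.3 only performs the comparison when @opts[:depth_limit] is actually set. A tiny illustration of the difference (depth 3, no limit configured):

    depth_limit = false

    # old style: the comparison raises ArgumentError, swallowed by `rescue false`
    too_deep = (3 >= depth_limit rescue false)             # => false, but masks real errors too

    # new style: compare only when a limit exists
    too_deep = depth_limit ? (3 >= depth_limit) : false    # => false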
@@ -213,11 +247,10 @@
#
# Returns +true+ if *link* should not be visited because
# its URL matches a skip_link pattern.
#
def skip_link?(link)
- @skip_link_patterns.each { |p| return true if link.path =~ p}
- false
+ @skip_link_patterns.any? { |p| link.path =~ p }
end
end
end
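skip_link? now delegates to Enumerable#any?, which stops at the first matching pattern and returns a plain true/false, replacing the manual each/return/false loop. For example (patterns and URL are illustrative):

    require 'uri'

    skip_link_patterns = [/\.js$/, %r{^/private/}]
    link = URI.parse("http://www.example.com/private/report.pdf")

    skip_link_patterns.any? { |p| link.path =~ p }   # => true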