lib/anemone/core.rb in anemone-0.0.6 vs lib/anemone/core.rb in anemone-0.1.0
- old
+ new
@@ -7,16 +7,17 @@
class Core
# PageHash storing all Page objects encountered during the crawl
attr_reader :pages
#
- # Initialize the crawl with a starting *url*, *options*, and optional *block*
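+ # Initialize the crawl with starting *urls* (a single URL String or an
+ # Array of URL Strings) and an optional *block*. Entries that are not
+ # Strings are mapped to nil by the conversion below, so pass Strings only.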
#
- def initialize(url, &block)
- url = URI(url) if url.is_a?(String)
- @url = url
- @url.path = "/" if @url.path.empty?
+ def initialize(urls, &block)
+ @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
+ @urls.each{ |url| url.path = '/' if url.path.empty? }
+
@tentacles = []
@pages = PageHash.new
@on_every_page_blocks = []
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@skip_link_patterns = []
@@ -79,42 +80,53 @@
end
self
end
#
+ # Specify a block which will select which links to follow on each page.
+ # The block should return an Array of URI objects.
+ #
def focus_crawl(&block)
  # Remember the caller's link-selection block (nil when none is given).
  # tap evaluates the assignment and hands back the receiver, so the
  # method still returns self.
  tap { @focus_crawl_block = block }
end
+
+ #
# Perform the crawl
#
def run
+ @urls.delete_if { |url| !visit_link?(url) }
+ return if @urls.empty?
+
link_queue = Queue.new
page_queue = Queue.new
Anemone.options.threads.times do |id|
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
end
- return if !visit_link?(@url)
-
- link_queue.enq(@url)
+ @urls.each{ |url| link_queue.enq(url) }
- while true do
+ loop do
page = page_queue.deq
@pages[page.url] = page
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
+ #perform the on_every_page blocks for this page
do_page_blocks(page)
page.doc = nil if Anemone.options.discard_page_bodies
- page.links.each do |link|
- if visit_link?(link)
- link_queue.enq(link)
- @pages[link] = nil
- end
+ links_to_follow(page).each do |link|
+ link_queue.enq(link)
+ @pages[link] = nil
end
+ #create an entry in the page hash for each alias of this page,
+ #i.e. all the pages that redirected to this page
page.aliases.each do |aka|
if !@pages.has_key?(aka) or @pages[aka].nil?
@pages[aka] = page.alias_clone(aka)
end
@pages[aka].add_alias!(page.url)
@@ -162,9 +174,19 @@
if page.url.to_s =~ pattern
blks.each { |blk| blk.call(page) }
end
end
end
+
+ #
+ # Return an Array of links to follow from the given page.
+ # Based on whether or not the link has already been crawled,
+ # and the block given to focus_crawl()
+ #
def links_to_follow(page)
  # Use the focus_crawl block's selection when one was registered,
  # otherwise consider every link reported by the page.
  candidates = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
  # Array() turns a nil block result into "follow nothing" instead of a
  # NoMethodError. uniq drops repeated entries so the same link is never
  # returned (and therefore never queued by the caller) twice for one page.
  Array(candidates).uniq.select { |link| visit_link?(link) }
end
#
# Returns +true+ if *link* has not been visited already,
# and is not excluded by a skip_link pattern. Returns
# +false+ otherwise.