lib/anemone/core.rb in anemone-0.2.0 vs lib/anemone/core.rb in anemone-0.2.1
- old
+ new
@@ -1,8 +1,9 @@
require 'net/http'
require 'thread'
require 'anemone/tentacle'
+require 'anemone/page'
require 'anemone/page_hash'
module Anemone
class Core
# PageHash storing all Page objects encountered during the crawl
@@ -10,36 +11,35 @@
#
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
# and an optional *block*.
#
# *urls* is wrapped in an Array and flattened, then each entry is mapped:
# the - version converts only Strings with URI() (any other entry maps to
# nil), while the + version keeps URI objects as they are and passes
# everything else through URI(). An empty path is then replaced with '/'.
#
# If a block is given it receives the new Core instance as the last step,
# after the crawl state below has been set up.
#
- def initialize(urls, &block)
- @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
+ def initialize(urls)
+ @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
@urls.each{ |url| url.path = '/' if url.path.empty? }
-
+
@tentacles = []
@pages = PageHash.new
@on_every_page_blocks = []
# Default block gives every new key its own empty Array on first access
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
@skip_link_patterns = []
@after_crawl_blocks = []
# A Robots instance is only created when the obey_robots_txt option is set
if Anemone.options.obey_robots_txt
@robots = Robots.new(Anemone.options.user_agent)
end
-
- block.call(self) if block
+
+ yield self if block_given?
end
#
# Convenience method to start a new crawl: builds a Core for *root*,
# passes it to the optional block, then calls +run+ on it.
#
# The - version returns the core explicitly from inside the block; the
# + version drops that +return+, so the method's value is whatever the
# +self.new+ call evaluates to.
#
- def self.crawl(root, &block)
+ def self.crawl(root)
self.new(root) do |core|
- block.call(core) if block
+ yield core if block_given?
# run is invoked only after the caller's block (if any) has been called
core.run
- return core
end
end
#
# Add a block to be executed on the PageHash after the crawl
@@ -102,11 +102,11 @@
return if @urls.empty?
link_queue = Queue.new
page_queue = Queue.new
- Anemone.options.threads.times do |id|
+ Anemone.options.threads.times do
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
end
@urls.each{ |url| link_queue.enq(url) }
@@ -118,11 +118,11 @@
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
# perform the on_every_page blocks for this page
do_page_blocks(page)
- page.doc = nil if Anemone.options.discard_page_bodies
+ page.discard_doc! if Anemone.options.discard_page_bodies
links_to_follow(page).each do |link|
link_queue.enq([link, page])
@pages[link] = nil
end
@@ -141,11 +141,11 @@
until link_queue.num_waiting == @tentacles.size
Thread.pass
end
if page_queue.empty?
- @tentacles.size.times { |i| link_queue.enq(:END)}
+ @tentacles.size.times { link_queue.enq(:END)}
break
end
end
end
@@ -205,19 +205,19 @@
too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
else
too_deep = false
end
- !@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
+ !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
end
#
# Returns +true+ if *link* should not be visited because
# its URL matches a skip_link pattern, +false+ otherwise.
# Only link.path is tested; each entry of @skip_link_patterns is matched
# against it with =~ (presumably Regexps -- confirm where they are added).
#
def skip_link?(link)
# return from inside the block exits the method on the first match
@skip_link_patterns.each { |p| return true if link.path =~ p}
- return false
+ false
end
end
end