lib/anemone/core.rb in anemone-0.2.0 vs lib/anemone/core.rb in anemone-0.2.1

- old
+ new

@@ -1,8 +1,9 @@ require 'net/http' require 'thread' require 'anemone/tentacle' +require 'anemone/page' require 'anemone/page_hash' module Anemone class Core # PageHash storing all Page objects encountered during the crawl @@ -10,36 +11,35 @@ # # Initialize the crawl with starting *urls* (single URL or Array of URLs) # and optional *block* # - def initialize(urls, &block) - @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) } + def initialize(urls) + @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) } @urls.each{ |url| url.path = '/' if url.path.empty? } - + @tentacles = [] @pages = PageHash.new @on_every_page_blocks = [] @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] } @skip_link_patterns = [] @after_crawl_blocks = [] if Anemone.options.obey_robots_txt @robots = Robots.new(Anemone.options.user_agent) end - - block.call(self) if block + + yield self if block_given? end # # Convenience method to start a new crawl # - def self.crawl(root, &block) + def self.crawl(root) self.new(root) do |core| - block.call(core) if block + yield core if block_given? core.run - return core end end # # Add a block to be executed on the PageHash after the crawl @@ -102,11 +102,11 @@ return if @urls.empty? link_queue = Queue.new page_queue = Queue.new - Anemone.options.threads.times do |id| + Anemone.options.threads.times do @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run } end @urls.each{ |url| link_queue.enq(url) } @@ -118,11 +118,11 @@ puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose # perform the on_every_page blocks for this page do_page_blocks(page) - page.doc = nil if Anemone.options.discard_page_bodies + page.discard_doc! if Anemone.options.discard_page_bodies links_to_follow(page).each do |link| link_queue.enq([link, page]) @pages[link] = nil end @@ -141,11 +141,11 @@ until link_queue.num_waiting == @tentacles.size Thread.pass end if page_queue.empty? - @tentacles.size.times { |i| link_queue.enq(:END)} + @tentacles.size.times { link_queue.enq(:END)} break end end end @@ -205,19 +205,19 @@ too_deep = from_page.depth >= Anemone.options.depth_limit rescue false else too_deep = false end - !@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep + !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep end # # Returns +true+ if *link* should not be visited because # its URL matches a skip_link pattern. # def skip_link?(link) @skip_link_patterns.each { |p| return true if link.path =~ p} - return false + false end end end