lib/anemone/core.rb in anemone-0.0.2 vs lib/anemone/core.rb in anemone-0.0.3

- old
+ new

@@ -1,179 +1,181 @@
-require 'net/http'
-require 'thread'
-require 'anemone/tentacle'
-require 'anemone/page_hash'
-
-module Anemone
-  class Core
-    # PageHash storing all Page objects encountered during the crawl
-    attr_reader :pages
-
-    #
-    # Initialize the crawl with a starting *url*, *options*, and optional *block*
-    #
-    def initialize(url, &block)
-      url = URI(url) if url.is_a?(String)
-      @url = url
-      @tentacles = []
-      @pages = PageHash.new
-      @on_every_page_blocks = []
-      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
-      @skip_link_patterns = []
-      @after_crawl_blocks = []
-
-      block.call(self) if block
-    end
-
-    #
-    # Convenience method to start a new crawl
-    #
-    def self.crawl(root, &block)
-      self.new(root) do |core|
-        block.call(core) if block
-        core.run
-        core.do_after_crawl_blocks
-        return core
-      end
-    end
-
-    #
-    # Add a block to be executed on the PageHash after the crawl
-    # is finished
-    #
-    def after_crawl(&block)
-      @after_crawl_blocks << block
-      self
-    end
-
-    #
-    # Add one ore more Regex patterns for URLs which should not be
-    # followed
-    #
-    def skip_links_like(*patterns)
-      if patterns
-        patterns.each do |pattern|
-          @skip_link_patterns << pattern
-        end
-      end
-      self
-    end
-
-    #
-    # Add a block to be executed on every Page as they are encountered
-    # during the crawl
-    #
-    def on_every_page(&block)
-      @on_every_page_blocks << block
-      self
-    end
-
-    #
-    # Add a block to be executed on Page objects with a URL matching
-    # one or more patterns
-    #
-    def on_pages_like(*patterns, &block)
-      if patterns
-        patterns.each do |pattern|
-          @on_pages_like_blocks[pattern] << block
-        end
-      end
-      self
-    end
-
-    #
-    # Perform the crawl
-    #
-    def run
-      link_queue = Queue.new
-      page_queue = Queue.new
-
-      Anemone.options.threads.times do |id|
-        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
-      end
-
-      return if !visit_link?(@url)
-
-      link_queue.enq(@url)
-
-      while true do
-        page = page_queue.deq
-
-        @pages[page.url] = page
-
-        puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
-
-        do_page_blocks(page)
-
-        page.links.each do |link|
-          if visit_link?(link)
-            link_queue.enq(link)
-            @pages[link] = nil
-          end
-        end
-
-        page.aliases.each do |aka|
-          if !@pages.has_key?(aka) or @pages[aka].nil?
-            @pages[aka] = page.alias_clone(aka)
-          end
-          @pages[aka].add_alias!(page.url)
-        end
-
-        # if we are done with the crawl, tell the threads to end
-        if link_queue.empty? and page_queue.empty?
-          until link_queue.num_waiting == @tentacles.size
-            Thread.pass
-          end
-
-          if page_queue.empty?
-            @tentacles.size.times { |i| link_queue.enq(:END)}
-            break
-          end
-        end
-
-      end
-
-      @tentacles.each { |t| t.join }
-
-      self
-    end
-
-    #
-    # Execute the after_crawl blocks
-    #
-    def do_after_crawl_blocks
-      @after_crawl_blocks.each {|b| b.call(@pages)}
-    end
-
-    #
-    # Execute the on_every_page blocks for *page*
-    #
-    def do_page_blocks(page)
-      @on_every_page_blocks.each do |blk|
-        blk.call(page)
-      end
-
-      @on_pages_like_blocks.each do |pattern, blk|
-        blk.call(page) if page.url.to_s =~ pattern
-      end
-    end
-
-    #
-    # Returns +true+ if *link* has not been visited already,
-    # and is not excluded by a skip_link pattern. Returns
-    # +false+ otherwise.
-    #
-    def visit_link?(link)
-      !@pages.has_key?(link) and !skip_link?(link)
-    end
-
-    #
-    # Returns +true+ if *link* should not be visited because
-    # its URL matches a skip_link pattern.
-    #
-    def skip_link?(link)
-      @skip_link_patterns.each { |p| return true if link.path =~ p}
-      return false
-    end
-
-  end
-end
\ No newline at end of file
+require 'net/http'
+require 'thread'
+require 'anemone/tentacle'
+require 'anemone/page_hash'
+
+module Anemone
+  class Core
+    # PageHash storing all Page objects encountered during the crawl
+    attr_reader :pages
+
+    #
+    # Initialize the crawl with a starting *url*, *options*, and optional *block*
+    #
+    def initialize(url, &block)
+      url = URI(url) if url.is_a?(String)
+      @url = url
+      @tentacles = []
+      @pages = PageHash.new
+      @on_every_page_blocks = []
+      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+      @skip_link_patterns = []
+      @after_crawl_blocks = []
+
+      block.call(self) if block
+    end
+
+    #
+    # Convenience method to start a new crawl
+    #
+    def self.crawl(root, &block)
+      self.new(root) do |core|
+        block.call(core) if block
+        core.run
+        core.do_after_crawl_blocks
+        return core
+      end
+    end
+
+    #
+    # Add a block to be executed on the PageHash after the crawl
+    # is finished
+    #
+    def after_crawl(&block)
+      @after_crawl_blocks << block
+      self
+    end
+
+    #
+    # Add one ore more Regex patterns for URLs which should not be
+    # followed
+    #
+    def skip_links_like(*patterns)
+      if patterns
+        patterns.each do |pattern|
+          @skip_link_patterns << pattern
+        end
+      end
+      self
+    end
+
+    #
+    # Add a block to be executed on every Page as they are encountered
+    # during the crawl
+    #
+    def on_every_page(&block)
+      @on_every_page_blocks << block
+      self
+    end
+
+    #
+    # Add a block to be executed on Page objects with a URL matching
+    # one or more patterns
+    #
+    def on_pages_like(*patterns, &block)
+      if patterns
+        patterns.each do |pattern|
+          @on_pages_like_blocks[pattern] << block
+        end
+      end
+      self
+    end
+
+    #
+    # Perform the crawl
+    #
+    def run
+      link_queue = Queue.new
+      page_queue = Queue.new
+
+      Anemone.options.threads.times do |id|
+        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+      end
+
+      return if !visit_link?(@url)
+
+      link_queue.enq(@url)
+
+      while true do
+        page = page_queue.deq
+
+        @pages[page.url] = page
+
+        puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
+
+        do_page_blocks(page)
+
+        page.links.each do |link|
+          if visit_link?(link)
+            link_queue.enq(link)
+            @pages[link] = nil
+          end
+        end
+
+        page.aliases.each do |aka|
+          if !@pages.has_key?(aka) or @pages[aka].nil?
+            @pages[aka] = page.alias_clone(aka)
+          end
+          @pages[aka].add_alias!(page.url)
+        end
+
+        # if we are done with the crawl, tell the threads to end
+        if link_queue.empty? and page_queue.empty?
+          until link_queue.num_waiting == @tentacles.size
+            Thread.pass
+          end
+
+          if page_queue.empty?
+            @tentacles.size.times { |i| link_queue.enq(:END)}
+            break
+          end
+        end
+
+      end
+
+      @tentacles.each { |t| t.join }
+
+      self
+    end
+
+    #
+    # Execute the after_crawl blocks
+    #
+    def do_after_crawl_blocks
+      @after_crawl_blocks.each {|b| b.call(@pages)}
+    end
+
+    #
+    # Execute the on_every_page blocks for *page*
+    #
+    def do_page_blocks(page)
+      @on_every_page_blocks.each do |blk|
+        blk.call(page)
+      end
+
+      @on_pages_like_blocks.each do |pattern, blks|
+        if page.url.to_s =~ pattern
+          blks.each { |blk| blk.call(page) }
+        end
+      end
+    end
+
+    #
+    # Returns +true+ if *link* has not been visited already,
+    # and is not excluded by a skip_link pattern. Returns
+    # +false+ otherwise.
+    #
+    def visit_link?(link)
+      !@pages.has_key?(link) and !skip_link?(link)
+    end
+
+    #
+    # Returns +true+ if *link* should not be visited because
+    # its URL matches a skip_link pattern.
+    #
+    def skip_link?(link)
+      @skip_link_patterns.each { |p| return true if link.path =~ p}
+      return false
+    end
+
+  end
+end
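
Aside from the added trailing newline at end of file, the only functional change between 0.0.2 and 0.0.3 is in do_page_blocks. @on_pages_like_blocks is a Hash whose default block creates an Array per pattern, and on_pages_like appends each registered block to that array. The 0.0.2 loop bound the whole array to blk and invoked blk.call(page), but Ruby's Array has no #call, so any on_pages_like handler raised NoMethodError as soon as its pattern matched. 0.0.3 tests the pattern once and then invokes every stored block. A minimal standalone sketch of the fixed dispatch, using a hypothetical handlers registry in place of @on_pages_like_blocks:

    # Pattern => [callbacks] registry, same default-block shape as
    # @on_pages_like_blocks in Core#initialize.
    handlers = Hash.new { |hash, key| hash[key] = [] }

    handlers[/article/] << ->(url) { puts "index   #{url}" }
    handlers[/article/] << ->(url) { puts "archive #{url}" }

    url = 'http://example.com/article/1'

    # 0.0.2 iterated |pattern, blk|, so blk was bound to the whole Array
    # and blk.call(url) raised NoMethodError on a match. 0.0.3 matches
    # once, then calls every block registered for that pattern:
    handlers.each do |pattern, blks|
      if url =~ pattern
        blks.each { |blk| blk.call(url) }
      end
    end
    # => index   http://example.com/article/1
    #    archive http://example.com/article/1

The fix also means multiple blocks registered for the same pattern now all fire, instead of the crawl dying on the first match.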
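For context, here is how the Core API shown above is driven end to end. This is a hedged sketch, not verified against the gem: it assumes the rest of anemone (Anemone.options with threads/verbose set, Tentacle, PageHash) is loaded and configured elsewhere, and that PageHash responds to #size like a Hash; the URLs and patterns are illustrative only. Note from the listing that skip_links_like patterns are matched against link.path in skip_link?, while on_pages_like patterns are matched against the full page.url.to_s.

    require 'anemone/core'

    # Drive Core.crawl directly; every method used here appears in the
    # 0.0.3 listing above. Assumes Anemone.options is set up elsewhere.
    Anemone::Core.crawl('http://example.com/') do |core|
      # Regexes matched against link.path by skip_link?
      core.skip_links_like(/\.pdf$/, /\/private\//)

      core.on_every_page do |page|
        puts page.url
      end

      # Under 0.0.3 both handlers fire for matching pages; under 0.0.2
      # this registration pattern hit the Array#call bug fixed above.
      core.on_pages_like(/article/) { |page| puts "article: #{page.url}" }
      core.on_pages_like(/article/) { |page| puts "second handler ran" }

      # Receives the PageHash once the crawl finishes.
      core.after_crawl do |pages|
        puts "crawled #{pages.size} pages"
      end
    end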