lib/anemone/core.rb in anemone-0.0.2 vs lib/anemone/core.rb in anemone-0.0.3
- old
+ new
@@ -1,179 +1,181 @@
-require 'net/http'
-require 'thread'
-require 'anemone/tentacle'
-require 'anemone/page_hash'
-
-module Anemone
- class Core
- # PageHash storing all Page objects encountered during the crawl
- attr_reader :pages
-
- #
- # Initialize the crawl with a starting *url* and an optional *block*
- #
- def initialize(url, &block)
- url = URI(url) if url.is_a?(String)
- @url = url
- @tentacles = []
- @pages = PageHash.new
- @on_every_page_blocks = []
- @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
- @skip_link_patterns = []
- @after_crawl_blocks = []
-
- block.call(self) if block
- end
-
- #
- # Convenience method to start a new crawl
- #
- def self.crawl(root, &block)
- self.new(root) do |core|
- block.call(core) if block
- core.run
- core.do_after_crawl_blocks
- return core
- end
- end
-
- #
- # Add a block to be executed on the PageHash after the crawl
- # is finished
- #
- def after_crawl(&block)
- @after_crawl_blocks << block
- self
- end
-
- #
- # Add one or more Regex patterns for URLs which should not be
- # followed
- #
- def skip_links_like(*patterns)
- if patterns
- patterns.each do |pattern|
- @skip_link_patterns << pattern
- end
- end
- self
- end
-
- #
- # Add a block to be executed on every Page as it is encountered
- # during the crawl
- #
- def on_every_page(&block)
- @on_every_page_blocks << block
- self
- end
-
- #
- # Add a block to be executed on Page objects with a URL matching
- # one or more patterns
- #
- def on_pages_like(*patterns, &block)
- if patterns
- patterns.each do |pattern|
- @on_pages_like_blocks[pattern] << block
- end
- end
- self
- end
-
- #
- # Perform the crawl
- #
- def run
- link_queue = Queue.new
- page_queue = Queue.new
-
- Anemone.options.threads.times do |id|
- @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
- end
-
- return if !visit_link?(@url)
-
- link_queue.enq(@url)
-
- while true do
- page = page_queue.deq
-
- @pages[page.url] = page
-
- puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
-
- do_page_blocks(page)
-
- page.links.each do |link|
- if visit_link?(link)
- link_queue.enq(link)
- @pages[link] = nil
- end
- end
-
- page.aliases.each do |aka|
- if !@pages.has_key?(aka) or @pages[aka].nil?
- @pages[aka] = page.alias_clone(aka)
- end
- @pages[aka].add_alias!(page.url)
- end
-
- # if we are done with the crawl, tell the threads to end
- if link_queue.empty? and page_queue.empty?
- until link_queue.num_waiting == @tentacles.size
- Thread.pass
- end
-
- if page_queue.empty?
- @tentacles.size.times { |i| link_queue.enq(:END)}
- break
- end
- end
-
- end
-
- @tentacles.each { |t| t.join }
-
- self
- end
-
- #
- # Execute the after_crawl blocks
- #
- def do_after_crawl_blocks
- @after_crawl_blocks.each {|b| b.call(@pages)}
- end
-
- #
- # Execute the on_every_page and on_pages_like blocks for *page*
- #
- def do_page_blocks(page)
- @on_every_page_blocks.each do |blk|
- blk.call(page)
- end
-
- @on_pages_like_blocks.each do |pattern, blk|
- blk.call(page) if page.url.to_s =~ pattern
- end
- end
-
- #
- # Returns +true+ if *link* has not been visited already,
- # and is not excluded by a skip_link pattern. Returns
- # +false+ otherwise.
- #
- def visit_link?(link)
- !@pages.has_key?(link) and !skip_link?(link)
- end
-
- #
- # Returns +true+ if *link* should not be visited because
- # its URL matches a skip_link pattern.
- #
- def skip_link?(link)
- @skip_link_patterns.each { |p| return true if link.path =~ p}
- return false
- end
-
- end
-end
\ No newline at end of file
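
The substantive change between these versions is in do_page_blocks. In 0.0.2, @on_pages_like_blocks is a Hash whose default block stores an *array* of callbacks per pattern, but the dispatch loop above treats each value as a single block and calls blk.call(page) on the Array itself. Array has no #call method, so any handler registered via on_pages_like raises NoMethodError on the first page whose URL matches. A minimal standalone sketch of the failure (the pattern and URL are made up for illustration):

    blocks = Hash.new { |hash, key| hash[key] = [] }
    blocks[/articles/] << proc { |url| puts "handler: #{url}" }

    url = "http://example.com/articles/1"

    # 0.0.2's loop: blk is the Array [#<Proc>], not a Proc
    blocks.each do |pattern, blk|
      blk.call(url) if url =~ pattern # NoMethodError: Array does not respond to #call
    end

The 0.0.3 listing below renames the hash value to blks and iterates it, invoking every stored block.
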
+require 'net/http'
+require 'thread'
+require 'anemone/tentacle'
+require 'anemone/page_hash'
+
+module Anemone
+ class Core
+ # PageHash storing all Page objects encountered during the crawl
+ attr_reader :pages
+
+ #
+ # Initialize the crawl with a starting *url* and an optional *block*
+ #
+ def initialize(url, &block)
+ url = URI(url) if url.is_a?(String)
+ @url = url
+ @tentacles = []
+ @pages = PageHash.new
+ @on_every_page_blocks = []
+ @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+ @skip_link_patterns = []
+ @after_crawl_blocks = []
+
+ block.call(self) if block
+ end
+
+ #
+ # Convenience method to start a new crawl
+ #
+ def self.crawl(root, &block)
+ self.new(root) do |core|
+ block.call(core) if block
+ core.run
+ core.do_after_crawl_blocks
+ return core
+ end
+ end
+
+ #
+ # Add a block to be executed on the PageHash after the crawl
+ # is finished
+ #
+ def after_crawl(&block)
+ @after_crawl_blocks << block
+ self
+ end
+
+ #
+ # Add one or more Regex patterns for URLs which should not be
+ # followed
+ #
+ def skip_links_like(*patterns)
+ if patterns
+ patterns.each do |pattern|
+ @skip_link_patterns << pattern
+ end
+ end
+ self
+ end
+
+ #
+ # Add a block to be executed on every Page as it is encountered
+ # during the crawl
+ #
+ def on_every_page(&block)
+ @on_every_page_blocks << block
+ self
+ end
+
+ #
+ # Add a block to be executed on Page objects with a URL matching
+ # one or more patterns
+ #
+ def on_pages_like(*patterns, &block)
+ if patterns
+ patterns.each do |pattern|
+ @on_pages_like_blocks[pattern] << block
+ end
+ end
+ self
+ end
+
+ #
+ # Perform the crawl
+ #
+ def run
+ link_queue = Queue.new
+ page_queue = Queue.new
+
+ Anemone.options.threads.times do |id|
+ @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+ end
+
+ return if !visit_link?(@url)
+
+ link_queue.enq(@url)
+
+ while true do
+ page = page_queue.deq
+
+ @pages[page.url] = page
+
+ puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
+
+ do_page_blocks(page)
+
+ page.links.each do |link|
+ if visit_link?(link)
+ link_queue.enq(link)
+ @pages[link] = nil
+ end
+ end
+
+ page.aliases.each do |aka|
+ if !@pages.has_key?(aka) or @pages[aka].nil?
+ @pages[aka] = page.alias_clone(aka)
+ end
+ @pages[aka].add_alias!(page.url)
+ end
+
+ # if we are done with the crawl, tell the threads to end
+ if link_queue.empty? and page_queue.empty?
+ until link_queue.num_waiting == @tentacles.size
+ Thread.pass
+ end
+
+ if page_queue.empty?
+ @tentacles.size.times { |i| link_queue.enq(:END)}
+ break
+ end
+ end
+
+ end
+
+ @tentacles.each { |t| t.join }
+
+ self
+ end
+
+ #
+ # Execute the after_crawl blocks
+ #
+ def do_after_crawl_blocks
+ @after_crawl_blocks.each {|b| b.call(@pages)}
+ end
+
+ #
+ # Execute the on_every_page and on_pages_like blocks for *page*
+ #
+ def do_page_blocks(page)
+ @on_every_page_blocks.each do |blk|
+ blk.call(page)
+ end
+
+ @on_pages_like_blocks.each do |pattern, blks|
+ if page.url.to_s =~ pattern
+ blks.each { |blk| blk.call(page) }
+ end
+ end
+ end
+
+ #
+ # Returns +true+ if *link* has not been visited already,
+ # and is not excluded by a skip_link pattern. Returns
+ # +false+ otherwise.
+ #
+ def visit_link?(link)
+ !@pages.has_key?(link) and !skip_link?(link)
+ end
+
+ #
+ # Returns +true+ if *link* should not be visited because
+ # its URL matches a skip_link pattern.
+ #
+ def skip_link?(link)
+ @skip_link_patterns.each { |p| return true if link.path =~ p}
+ return false
+ end
+
+ end
+end
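
With the fixed dispatch, several handlers can be registered for the same pattern and all of them fire. An illustrative use of the API above (the start URL and patterns are hypothetical, and Anemone.options must be configured as usual before running; Core.new here only registers the handlers, while core.run would perform the actual crawl):

    require 'anemone/core'

    core = Anemone::Core.new("http://example.com/") do |c|
      c.skip_links_like(/\.pdf$/)
      c.on_every_page { |page| puts page.url }
      c.on_pages_like(/articles/) { |page| puts "first handler: #{page.url}" }
      c.on_pages_like(/articles/) { |page| puts "second handler: #{page.url}" }
    end

    # Under 0.0.2 the first matching page raised NoMethodError; under 0.0.3
    # both /articles/ handlers run for every page whose URL matches.
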