lib/anemone/core.rb in anemone-0.4.0 vs lib/anemone/core.rb in anemone-0.5.0

- old
+ new

@@ -1,15 +1,17 @@
 require 'thread'
 require 'robots'
 require 'anemone/tentacle'
 require 'anemone/page'
+require 'anemone/exceptions'
 require 'anemone/page_store'
 require 'anemone/storage'
+require 'anemone/storage/base'
 
 module Anemone
 
-  VERSION = '0.4.0';
+  VERSION = '0.5.0';
 
   #
   # Convenience method to start a crawl
   #
   def Anemone.crawl(urls, options = {}, &block)
@@ -43,11 +45,13 @@
       # storage engine defaults to Hash in +process_options+ if none specified
       :storage => nil,
       # Hash of cookie name => value to send with HTTP requests
       :cookies => nil,
       # accept cookies from the server and send them back?
-      :accept_cookies => false
+      :accept_cookies => false,
+      # skip any link with a query string? e.g. http://foo.com/?u=user
+      :skip_query_strings => false
     }
 
     # Create setter methods for all options to be called from the crawl block
     DEFAULT_OPTS.keys.each do |key|
      define_method "#{key}=" do |value|
@@ -185,11 +189,12 @@
 
     private
 
     def process_options
       @opts = DEFAULT_OPTS.merge @opts
       @opts[:threads] = 1 if @opts[:delay] > 0
-      @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
+      storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
+      @pages = PageStore.new(storage)
       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
 
       freeze_options
     end
@@ -239,18 +244,43 @@
     # and is not excluded by robots.txt...
     # and is not deeper than the depth limit
     # Returns +false+ otherwise.
     #
     def visit_link?(link, from_page = nil)
-      allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
-
+      !@pages.has_page?(link) &&
+      !skip_link?(link) &&
+      !skip_query_string?(link) &&
+      allowed(link) &&
+      !too_deep?(from_page)
+    end
+
+    #
+    # Returns +true+ if we are obeying robots.txt and the link
+    # is granted access in it. Always returns +true+ when we are
+    # not obeying robots.txt.
+    #
+    def allowed(link)
+      @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+    end
+
+    #
+    # Returns +true+ if we are over the page depth limit.
+    # This only works when coming from a page and with the +depth_limit+ option set.
+    # When neither is the case, will always return +false+.
+    def too_deep?(from_page)
      if from_page && @opts[:depth_limit]
-        too_deep = from_page.depth >= @opts[:depth_limit]
+        from_page.depth >= @opts[:depth_limit]
      else
-        too_deep = false
+        false
      end
-
-      !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
+    end
+
+    #
+    # Returns +true+ if *link* should not be visited because
+    # it has a query string and +skip_query_strings+ is true.
+    #
+    def skip_query_string?(link)
+      @opts[:skip_query_strings] && link.query
     end
     #
     # Returns +true+ if *link* should not be visited because
     # its URL matches a skip_link pattern.
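
Usage sketch (not part of the diff): the new :skip_query_strings option is an
ordinary DEFAULT_OPTS entry, so it can be passed to Anemone.crawl directly or
set through the writer method that the crawl block generates for every option
key. The URLs below are illustrative only.

    require 'anemone'

    # Pass the option up front...
    Anemone.crawl("http://foo.com/", :skip_query_strings => true)

    # ...or set it via the generated per-option writer inside the block.
    Anemone.crawl("http://foo.com/") do |anemone|
      anemone.skip_query_strings = true
    end

With the option enabled, skip_query_string? returns a truthy value for any
link whose URI#query is non-nil, so http://foo.com/?u=user is skipped while
http://foo.com/page is still crawled.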