lib/anemone/core.rb: anemone-0.4.0 vs anemone-0.5.0
- removed (present only in 0.4.0)
+ added (new in 0.5.0)
@@ -1,15 +1,17 @@
require 'thread'
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
+require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
+require 'anemone/storage/base'
module Anemone
- VERSION = '0.4.0';
+ VERSION = '0.5.0';
#
# Convenience method to start a crawl
#
def Anemone.crawl(urls, options = {}, &block)
@@ -43,11 +45,13 @@
# storage engine defaults to Hash in +process_options+ if none specified
:storage => nil,
# Hash of cookie name => value to send with HTTP requests
:cookies => nil,
# accept cookies from the server and send them back?
- :accept_cookies => false
+ :accept_cookies => false,
+ # skip any link with a query string? e.g. http://foo.com/?u=user
+ :skip_query_strings => false
}
# Create setter methods for all options to be called from the crawl block
DEFAULT_OPTS.keys.each do |key|
define_method "#{key}=" do |value|
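
The new :skip_query_strings option tells the crawler to ignore any discovered link whose URL carries a query string. A minimal usage sketch, relying only on the Anemone.crawl entry point and the option setters generated by the define_method loop above; the URL and block body are illustrative:

    require 'anemone'

    Anemone.crawl("http://example.com/", :skip_query_strings => true) do |anemone|
      # same effect as the option hash: anemone.skip_query_strings = true
      anemone.on_every_page do |page|
        puts page.url   # links such as http://example.com/?utm_source=x are never fetched
      end
    end
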
@@ -185,11 +189,12 @@
private
def process_options
@opts = DEFAULT_OPTS.merge @opts
@opts[:threads] = 1 if @opts[:delay] > 0
- @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
+ storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
+ @pages = PageStore.new(storage)
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
freeze_options
end
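
In 0.5.0, process_options wraps whatever storage engine is supplied (or the default in-memory Hash) in Anemone::Storage::Base before building the PageStore, so callers keep passing the :storage option exactly as before. A minimal sketch under that assumption; PStore is one of the adapters exposed by the Anemone::Storage module required above, and the filename is illustrative:

    require 'anemone'

    # same public interface as 0.4.0; only the internal wrapping changed
    Anemone.crawl("http://example.com/",
                  :storage => Anemone::Storage.PStore('crawl.pstore'))
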
@@ -239,18 +244,43 @@
# and is not excluded by robots.txt...
# and is not deeper than the depth limit
# Returns +false+ otherwise.
#
def visit_link?(link, from_page = nil)
- allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+ !@pages.has_page?(link) &&
+ !skip_link?(link) &&
+ !skip_query_string?(link) &&
+ allowed(link) &&
+ !too_deep?(from_page)
+ end

+ #
+ # Returns +true+ if we are obeying robots.txt and the link
+ # is granted access in it. Always returns +true+ when we are
+ # not obeying robots.txt.
+ #
+ def allowed(link)
+ @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+ end
+
+ #
+ # Returns +true+ if we are over the page depth limit.
+ # This only works when coming from a page and with the +depth_limit+ option set.
+ # When neither is the case, will always return +false+.
+ def too_deep?(from_page)
if from_page && @opts[:depth_limit]
- too_deep = from_page.depth >= @opts[:depth_limit]
+ from_page.depth >= @opts[:depth_limit]
else
- too_deep = false
+ false
end
-
- !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
+ end
+
+ #
+ # Returns +true+ if *link* should not be visited because
+ # it has a query string and +skip_query_strings+ is true.
+ #
+ def skip_query_string?(link)
+ @opts[:skip_query_strings] && link.query
end
#
# Returns +true+ if *link* should not be visited because
# its URL matches a skip_link pattern.
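
The refactored visit_link? is now a chain of small predicates, and the new skip_query_string? simply keys on the link's query component: the links handed to these checks are URI objects, and URI#query is nil when there is no query string. A small illustration using only Ruby's stdlib; the URLs are illustrative:

    require 'uri'

    URI('http://foo.com/?u=user').query  #=> "u=user"  (rejected when :skip_query_strings is true)
    URI('http://foo.com/about').query    #=> nil       (passes the skip_query_string? check)
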