lib/spidr/agent.rb in spidr-0.3.1 vs lib/spidr/agent.rb in spidr-0.3.2

- old
+ new

@@ -113,27 +113,30 @@ if options[:host_headers] @host_headers.merge!(options[:host_headers]) end - @user_agent = (options[:user_agent] || Spidr.user_agent) + @user_agent = options.fetch(:user_agent,Spidr.user_agent) @referer = options[:referer] - @sessions = SessionCache.new(options[:proxy] || Spidr.proxy) + @sessions = SessionCache.new(options.fetch(:proxy,Spidr.proxy)) @cookies = CookieJar.new @authorized = AuthStore.new @running = false - @delay = (options[:delay] || 0) + @delay = options.fetch(:delay,0) @history = Set[] @failures = Set[] @queue = [] @levels = Hash.new(0) @max_depth = options[:max_depth] - super(options) + initialize_sanitizers(options) + initialize_filters(options) + initialize_actions(options) + initialize_events(options) yield self if block_given? end # @@ -150,23 +153,20 @@ # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # - def self.start_at(url,options={}) - self.new(options) do |spider| - yield spider if block_given? - - spider.start_at(url) - end + def self.start_at(url,options={},&block) + agent = new(options,&block) + agent.start_at(url) end # - # Creates a new agent and spiders the given host. + # Creates a new agent and spiders the web-site located at the given URL. # - # @param [String] - # The host-name to spider. + # @param [URI::HTTP, String] url + # The web-site to spider. # # @param [Hash] options # Additional options. See {Agent#initialize}. # # @yield [agent] @@ -174,23 +174,22 @@ # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # - def self.host(name,options={}) - self.new(options.merge(:host => name)) do |spider| - yield spider if block_given? + def self.site(url,options={},&block) + url = URI(url.to_s) unless url.kind_of?(URI) - spider.start_at("http://#{name}/") - end + agent = new(options.merge(:host => url.host),&block) + agent.start_at(url) end # - # Creates a new agent and spiders the web-site located at the given URL. + # Creates a new agent and spiders the given host. # - # @param [URI::HTTP, String] url - # The web-site to spider. + # @param [String] + # The host-name to spider. # # @param [Hash] options # Additional options. See {Agent#initialize}. # # @yield [agent] @@ -198,18 +197,12 @@ # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # - def self.site(url,options={}) - url = URI(url.to_s) - - return self.new(options.merge(:host => url.host)) do |spider| - yield spider if block_given? - - spider.start_at(url) - end + def self.host(name,options={},&block) + site(URI::HTTP.build(:host => name, :path => '/'),options,&block) end # # Clears the history of the agent. # @@ -232,11 +225,10 @@ # @yieldparam [Page] page # A page which has been visited. # def start_at(url,&block) enqueue(url) - return run(&block) end # # Start spidering until the queue becomes empty or the agent is @@ -259,11 +251,10 @@ rescue Actions::Action end end @running = false - @sessions.clear return self end # @@ -385,14 +376,14 @@ def failures=(new_failures) @failures.clear new_failures.each do |url| @failures << unless url.kind_of?(URI) - URI(url.to_s) - else - url - end + URI(url.to_s) + else + url + end end return @failures end @@ -469,11 +460,11 @@ link = url.to_s begin @every_url_blocks.each { |url_block| url_block.call(url) } - @urls_like_blocks.each do |pattern,url_blocks| + @every_url_like_blocks.each do |pattern,url_blocks| match = case pattern when Regexp link =~ pattern else (pattern == link) || (pattern == url) @@ -651,17 +642,16 @@ # @since 0.2.2 # def prepare_request(url,&block) host = url.host port = url.port + path = unless url.path.empty? + url.path + else + '/' + end - unless url.path.empty? - path = url.path - else - path = '/' - end - # append the URL query to the path path += "?#{url.query}" if url.query # set any additional HTTP headers headers = {} @@ -722,10 +712,10 @@ # # @return [Boolean] # Specifies whether the given URL should be visited. # def visit?(url) - !(visited?(url)) && + !visited?(url) && visit_scheme?(url.scheme) && visit_host?(url.host) && visit_port?(url.port) && visit_link?(url.to_s) && visit_url?(url) &&