lib/spidr/agent.rb in spidr-0.1.7 vs lib/spidr/agent.rb in spidr-0.1.8
- old
+ new
@@ -26,10 +26,13 @@
attr_reader :history
# List of unreachable URLs
attr_reader :failures
+ # Queue of URLs to visit
+ attr_reader :queue
+
#
# Creates a new Agent object with the given _options_ and _block_.
# If a _block_ is given, it will be passed the newly created
# Agent object.
#
@@ -80,10 +83,11 @@
@delay = (options[:delay] || 0)
@history = []
@failures = []
@queue = []
+ @paused = true
if options[:host]
visit_hosts_like(options[:host])
end
@@ -359,26 +363,74 @@
#
# Clear the history and start spidering at the specified _url_.
#
def start_at(url)
# Start from a clean slate (see #clear) before crawling from +url+.
clear
- return run(url)
+ enqueue(url)
+
+ return continue!
end
#
- # Start spidering at the specified _url_.
+ # Start spidering until the queue becomes empty or the agent is
+ # paused.
#
- def run(url)
- enqueue(url)
-
- until @queue.empty?
+ def run
+ until (@queue.empty? || @paused == true)
# Take the next URL off the queue and visit it, one per iteration.
visit_page(dequeue)
end
# Hand back the agent itself.
return self
end
+ #
+ # Continue spidering.
+ #
+ def continue!
+ @paused = false
+ return run
+ end
+
+ #
+ # Returns +true+ if the agent is not paused, returns +false+
+ # otherwise.
+ #
+ def running?
+ @paused == false
+ end
+
+ #
+ # Returns +true+ if the agent is paused, returns +false+ otherwise.
+ #
+ def paused?
+ @paused == true
+ end
+
+ #
+ # Pauses the agent, causing spidering to temporarily stop.
+ #
+ def pause!
+ @paused = true
+ return self
+ end
+
+ #
+ # Sets the history of links that were previously visited to the
+ # specified _new_history_.
+ #
+ # agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
+ #
+ def history=(new_history)
+ @history = new_history.map do |url|
+ unless url.kind_of?(URI)
+ URI(url.to_s)
+ else
+ url
+ end
+ end
+ end
+
alias visited_urls history
#
# Returns the +Array+ of visited URLs.
#
@@ -396,29 +448,27 @@
#
# Returns +true+ if the specified _url_ was visited, returns +false+
# otherwise.
#
def visited?(url)
# Coerce the argument to a URI before the membership test against
# @history.
- unless url.kind_of?(URI)
- url = URI(url)
- end
+ url = URI(url) unless url.kind_of?(URI)
return @history.include?(url)
end
#
# Returns +true+ if the specified _url_ was unable to be visited,
# returns +false+ otherwise.
#
def failed?(url)
# Coerce the argument to a URI before the membership test against
# @failures.
- unless url.kind_of?(URI)
- url = URI(url)
- end
+ url = URI(url) unless url.kind_of?(URI)
return @failures.include?(url)
end
+ alias pending_urls queue
+
#
# Creates a new Page object from the specified _url_. If a _block_ is
# given, it will be passed the newly created Page object.
#
def get_page(url,&block)
@@ -452,13 +502,34 @@
failed(url)
return nil
end
end
- protected
+ #
+ # Returns the agent represented as a Hash containing the agent's
+ # +history+ and +queue+ information.
+ #
+ def to_hash
+ {:history => @history, :queue => @queue}
+ end
#
+ # Sets the queue of links to visit to the specified _new_queue_.
+ #
+ # agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
+ #
+ def queue=(new_queue)
+ @queue = new_queue.map do |url|
+ unless url.kind_of?(URI)
+ URI(url.to_s)
+ else
+ url
+ end
+ end
+ end
+
+ #
# Returns +true+ if the specified _url_ is queued for visiting, returns
# +false+ otherwise.
#
def queued?(url)
@queue.include?(url)
@@ -487,10 +558,12 @@
end
return false
end
+ protected
+
#
# Dequeues a URL that will later be visited.
#
def dequeue
@queue.shift
@@ -572,12 +645,10 @@
#
# Adds the specified _url_ to the failures list.
#
def failed(url)
# Coerce to a URI; going through to_s lets non-String values be parsed.
- unless url.kind_of?(URI)
- url = URI(url.to_s)
- end
+ url = URI(url.to_s) unless url.kind_of?(URI)
# Call every block in @every_failed_url_blocks with the URL, then record
# it in @failures.
@every_failed_url_blocks.each { |block| block.call(url) }
@failures << url
return true
end