lib/spidr/agent.rb in spidr-0.1.7 vs lib/spidr/agent.rb in spidr-0.1.8

- old
+ new

@@ -26,10 +26,13 @@ attr_reader :history # List of unreachable URLs attr_reader :failures + # Queue of URLs to visit + attr_reader :queue + # # Creates a new Agent object with the given _options_ and _block_. # If a _block_ is given, it will be passed the newly created # Agent object. # @@ -80,10 +83,11 @@ @delay = (options[:delay] || 0) @history = [] @failures = [] @queue = [] + @paused = true if options[:host] visit_hosts_like(options[:host]) end @@ -359,26 +363,74 @@ # # Clear the history and start spidering at the specified _url_. # def start_at(url) clear - return run(url) + enqueue(url) + + return continue! end # - # Start spidering at the specified _url_. + # Start spidering until the queue becomes empty or the agent is + # paused. # - def run(url) - enqueue(url) - - until @queue.empty? + def run + until (@queue.empty? || @paused == true) visit_page(dequeue) end return self end + # + # Continue spidering. + # + def continue! + @paused = false + return run + end + + # + # Returns +true+ if the agent is still spidering, returns +false+ + # otherwise. + # + def running? + @paused == false + end + + # + # Returns +true+ if the agent is paused, returns +false+ otherwise. + # + def paused? + @paused == true + end + + # + # Pauses the agent, causing spidering to temporarily stop. + # + def pause! + @paused = true + return self + end + + # + # Sets the history of links that were previously visited to the + # specified _new_history_. + # + # agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/'] + # + def history=(new_history) + @history = new_history.map do |url| + unless url.kind_of?(URI) + URI(url.to_s) + else + url + end + end + end + alias visited_urls history # # Returns the +Array+ of visited URLs. # @@ -396,29 +448,27 @@ # # Returns +true+ if the specified _url_ was visited, returns +false+ # otherwise. # def visited?(url) - unless url.kind_of?(URI) - url = URI(url) - end + url = URI(url) unless url.kind_of?(URI) return @history.include?(url) end # # Returns +true+ if the specified _url_ was unable to be visited, # returns +false+ otherwise. # def failed?(url) - unless url.kind_of?(URI) - url = URI(url) - end + url = URI(url) unless url.kind_of?(URI) return @failures.include?(url) end + alias pending_urls queue + # # Creates a new Page object from the specified _url_. If a _block_ is # given, it will be passed the newly created Page object. # def get_page(url,&block) @@ -452,13 +502,34 @@ failed(url) return nil end end - protected + # + # Returns the agent represented as a Hash containing the agents + # +history+ and +queue+ information. + # + def to_hash + {:history => @history, :queue => @queue} + end # + # Sets the queue of links to visit to the specified _new_queue_. + # + # agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/'] + # + def queue=(new_queue) + @queue = new_queue.map do |url| + unless url.kind_of?(URI) + URI(url.to_s) + else + url + end + end + end + + # # Returns +true+ if the specified _url_ is queued for visiting, returns # +false+ otherwise. # def queued?(url) @queue.include?(url) @@ -487,10 +558,12 @@ end return false end + protected + # # Dequeues a URL that will later be visited. # def dequeue @queue.shift @@ -572,12 +645,10 @@ # # Adds the specified _url_ to the failures list. # def failed(url) - unless url.kind_of?(URI) - url = URI(url.to_s) - end + url = URI(url.to_s) unless url.kind_of?(URI) @every_failed_url_blocks.each { |block| block.call(url) } @failures << url return true end