require 'spidr/filters' require 'spidr/events' require 'spidr/actions' require 'spidr/page' require 'spidr/spidr' require 'net/http' require 'set' module Spidr class Agent include Filters include Events include Actions # Proxy to use attr_accessor :proxy # User-Agent to use attr_accessor :user_agent # Referer to use attr_accessor :referer # Delay in between fetching pages attr_accessor :delay # History containing visited URLs attr_reader :history # List of unreachable URLs attr_reader :failures # Queue of URLs to visit attr_reader :queue # # Creates a new Agent object. # # @param [Hash] options # Additional options # # @option options [Hash] :proxy (Spidr.proxy) # The proxy information to use. # # @option :proxy [String] :host # The host the proxy is running on. # # @option :proxy [Integer] :port # The port the proxy is running on. # # @option :proxy [String] :user # The user to authenticate as with the proxy. # # @option :proxy [String] :password # The password to authenticate with. # # @option options [String] :user_agent (Spidr.user_agent) # The User-Agent string to send with each requests. # # @option options [String] :referer # The Referer URL to send with each request. # # @option options [Integer] :delay (0) # The number of seconds to pause between each request. # # @option options [Set, Array] :queue # The initial queue of URLs to visit. # # @option options [Set, Array] :history # The initial list of visited URLs. # # @yield [agent] # If a block is given, it will be passed the newly created agent # for further configuration. # # @yieldparam [Agent] agent # The newly created agent. # def initialize(options={},&block) @proxy = (options[:proxy] || Spidr.proxy) @user_agent = (options[:user_agent] || Spidr.user_agent) @referer = options[:referer] @running = false @delay = (options[:delay] || 0) @history = Set[] @failures = Set[] @queue = [] @sessions = {} super(options) block.call(self) if block end # # Creates a new agent and begin spidering at the given URL. # # @param [URI::HTTP, String] url # The URL to start spidering at. # # @param [Hash] options # Additional options. See {Agent#initialize}. # # @yield [agent] # If a block is given, it will be passed the newly created agent # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # def self.start_at(url,options={},&block) self.new(options) do |spider| block.call(spider) if block spider.start_at(url) end end # # Creates a new agent and spiders the given host. # # @param [String] # The host-name to spider. # # @param [Hash] options # Additional options. See {Agent#initialize}. # # @yield [agent] # If a block is given, it will be passed the newly created agent # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # def self.host(name,options={},&block) self.new(options.merge(:host => name)) do |spider| block.call(spider) if block spider.start_at("http://#{name}/") end end # # Creates a new agent and spiders the web-site located at the given URL. # # @param [URI::HTTP, String] url # The web-site to spider. # # @param [Hash] options # Additional options. See {Agent#initialize}. # # @yield [agent] # If a block is given, it will be passed the newly created agent # before it begins spidering. # # @yieldparam [Agent] agent # The newly created agent. # def self.site(url,options={},&block) url = URI(url.to_s) return self.new(options.merge(:host => url.host)) do |spider| block.call(spider) if block spider.start_at(url) end end # # Clears the history of the agent. # def clear @queue.clear @history.clear @failures.clear return self end # # Start spidering at a given URL. # # @param [URI::HTTP, String] url # The URL to start spidering at. # # @yield [page] # If a block is given, it will be passed every page visited. # # @yieldparam [Page] page # A page which has been visited. # def start_at(url,&block) enqueue(url) return run(&block) end # # Start spidering until the queue becomes empty or the agent is # paused. # # @yield [page] # If a block is given, it will be passed every page visited. # # @yieldparam [Page] page # A page which has been visited. # def run(&block) @running = true until (@queue.empty? || paused?) begin visit_page(dequeue,&block) rescue Actions::Paused return self rescue Actions::Action end end @running = false @sessions.each_value do |sess| begin sess.finish rescue IOError nil end end @sessions.clear return self end # # Determines if the agent is running. # # @return [Boolean] # Specifies whether the agent is running or stopped. # def running? @running == true end # # Sets the history of URLs that were previously visited. # # @param [#each] new_history # A list of URLs to populate the history with. # # @return [Set] # The history of the agent. # # @example # agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/'] # def history=(new_history) @history.clear new_history.each do |url| @history << unless url.kind_of?(URI) URI(url.to_s) else url end end return @history end alias visited_urls history # # Specifies the links which have been visited. # # @return [Array] # The links which have been visited. # def visited_links @history.map { |url| url.to_s } end # # Specifies all hosts that were visited. # # @return [Array] # The hosts which have been visited. # def visited_hosts visited_urls.map { |uri| uri.host }.uniq end # # Determines whether a URL was visited or not. # # @param [URI::HTTP, String] url # The URL to search for. # # @return [Boolean] # Specifies whether a URL was visited. # def visited?(url) url = URI(url.to_s) unless url.kind_of?(URI) return @history.include?(url) end # # Sets the list of failed URLs. # # @param [#each] # The new list of failed URLs. # # @return [Array] # The list of failed URLs. # # @example # agent.failures = ['http://localhost/'] # def failures=(new_failures) @failures.clear new_failures.each do |url| @failures << unless url.kind_of?(URI) URI(url.to_s) else url end end return @failures end # # Determines whether a given URL could not be visited. # # @param [URI::HTTP, String] url # The URL to check for failures. # # @return [Boolean] # Specifies whether the given URL was unable to be visited. # def failed?(url) url = URI(url.to_s) unless url.kind_of?(URI) return @failures.include?(url) end alias pending_urls queue # # Sets the queue of URLs to visit. # # @param [#each] # The new list of URLs to visit. # # @return [Array] # The list of URLs to visit. # # @example # agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/'] # def queue=(new_queue) @queue.clear new_queue.each do |url| @queue << unless url.kind_of?(URI) URI(url.to_s) else url end end return @queue end # # Determines whether a given URL has been enqueued. # # @param [URI::HTTP] url # The URL to search for in the queue. # # @return [Boolean] # Specifies whether the given URL has been queued for visiting. # def queued?(url) @queue.include?(url) end # # Enqueues a given URL for visiting, only if it passes all of the # agent's rules for visiting a given URL. # # @param [URI::HTTP, String] url # The URL to enqueue for visiting. # # @return [Boolean] # Specifies whether the URL was enqueued, or ignored. # def enqueue(url) link = url.to_s url = URI(link) unless url.kind_of?(URI) if (!(queued?(url)) && visit?(url)) begin @every_url_blocks.each { |block| block.call(url) } @urls_like_blocks.each do |pattern,blocks| if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url) blocks.each { |url_block| url_block.call(url) } end end rescue Actions::Paused => action raise(action) rescue Actions::SkipLink return false rescue Actions::Action end @queue << url return true end return false end # # Requests and creates a new Page object from a given URL. # # @param [URI::HTTP] url # The URL to request. # # @yield [page] # If a block is given, it will be passed the page that represents the # response. # # @yieldparam [Page] page # The page for the response. # # @return [Page, nil] # The page for the response, or +nil+ if the request failed. # def get_page(url,&block) url = URI(url.to_s) unless url.kind_of?(URI) host = url.host port = url.port unless url.path.empty? path = url.path else path = '/' end # append the URL query to the path path += "?#{url.query}" if url.query begin sleep(@delay) if @delay > 0 get_session(url.scheme,host,port) do |sess| headers = {} headers['User-Agent'] = @user_agent if @user_agent headers['Referer'] = @referer if @referer new_page = Page.new(url,sess.get(path,headers)) block.call(new_page) if block return new_page end rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError failed(url) kill_session(url.scheme,host,port) return nil end end # # Visits a given URL, and enqueus the links recovered from the URL # to be visited later. # # @param [URI::HTTP, String] url # The URL to visit. # # @yield [page] # If a block is given, it will be passed the page which was visited. # # @yieldparam [Page] page # The page which was visited. # # @return [Page, nil] # The page that was visited. If +nil+ is returned, either the request # for the page failed, or the page was skipped. # def visit_page(url,&block) url = URI(url.to_s) unless url.kind_of?(URI) get_page(url) do |page| @history << page.url begin @every_page_blocks.each { |page_block| page_block.call(page) } block.call(page) if block rescue Actions::Paused => action raise(action) rescue Actions::SkipPage return nil rescue Actions::Action end page.urls.each { |next_url| enqueue(next_url) } end end # # Converts the agent into a Hash. # # @return [Hash] # The agent represented as a Hash containing the +history+ and # the +queue+ of the agent. # def to_hash {:history => @history, :queue => @queue} end protected # # Provides an active HTTP session for the given scheme, host # and port. # # @param [String] scheme # The scheme of the URL, which will be requested later. # # @param [String] host # The host that the session is needed with. # # @param [Integer] port # The port that the session is needed for. # # @yield [session] # If a block is given, it will be passed the active HTTP session. # # @yieldparam [Net::HTTP] session # The active HTTP session object. # def get_session(scheme,host,port,&block) key = [scheme,host,port] unless @sessions[key] session = Net::HTTP::Proxy( @proxy[:host], @proxy[:port], @proxy[:user], @proxy[:password] ).new(host,port) if scheme == 'https' session.use_ssl = true session.verify_mode = OpenSSL::SSL::VERIFY_NONE end @sessions[key] = session end session = @sessions[key] block.call(session) if block return session end # # Destroys an HTTP session for the given scheme, host and port. # # @param [String] scheme # The scheme of the URL, which was requested through the session. # # @param [String] host # The host that the session was connected with. # # @param [Integer] port # The port that the session was connected to. # def kill_session(scheme,host,port,&block) key = [scheme,host,port] sess = @sessions[key] begin sess.finish rescue IOError nil end @sessions.delete(key) block.call if block return nil end # # Dequeues a URL that will later be visited. # # @return [URI::HTTP] # The URL that was at the front of the queue. # def dequeue @queue.shift end # # Determines if a given URL should be visited. # # @param [URI::HTTP] url # The URL in question. # # @return [Boolean] # Specifies whether the given URL should be visited. # def visit?(url) (!(visited?(url)) && visit_scheme?(url.scheme) && visit_host?(url.host) && visit_port?(url.port) && visit_link?(url.to_s) && visit_ext?(url.path)) end # # Adds a given URL to the failures list. # # @param [URI::HTTP] url # The URL to add to the failures list. # def failed(url) @every_failed_url_blocks.each { |block| block.call(url) } @failures << url return true end end end