lib/spidr/agent.rb in spidr-0.2.1 vs lib/spidr/agent.rb in spidr-0.2.2

- old
+ new

@@ -1,27 +1,32 @@ +require 'spidr/sanitizers' require 'spidr/filters' require 'spidr/events' require 'spidr/actions' require 'spidr/page' +require 'spidr/session_cache' +require 'spidr/cookie_jar' +require 'spidr/auth_store' require 'spidr/spidr' require 'net/http' require 'set' module Spidr class Agent + include Sanitizers include Filters include Events include Actions - # Proxy to use - attr_accessor :proxy - # User-Agent to use attr_accessor :user_agent + # HTTP Authentication credentials + attr_accessor :authorized + # Referer to use attr_accessor :referer # Delay in between fetching pages attr_accessor :delay @@ -33,10 +38,13 @@ attr_reader :failures # Queue of URLs to visit attr_reader :queue + # Cached cookies + attr_reader :cookies + # # Creates a new Agent object. # # @param [Hash] options # Additional options @@ -77,22 +85,23 @@ # # @yieldparam [Agent] agent # The newly created agent. # def initialize(options={},&block) - @proxy = (options[:proxy] || Spidr.proxy) @user_agent = (options[:user_agent] || Spidr.user_agent) @referer = options[:referer] + @sessions = SessionCache.new(options[:proxy] || Spidr.proxy) + @cookies = CookieJar.new + @authorized = AuthStore.new + @running = false @delay = (options[:delay] || 0) @history = Set[] @failures = Set[] @queue = [] - @sessions = {} - super(options) block.call(self) if block end @@ -220,18 +229,10 @@ end end @running = false - @sessions.each_value do |sess| - begin - sess.finish - rescue IOError - nil - end - end - @sessions.clear return self end # @@ -243,10 +244,41 @@ def running? @running == true end # + # The proxy information the agent uses. + # + # @return [Hash] + # The proxy information. + # + # @see SessionCache#proxy + # + # @since 0.2.2 + # + def proxy + @sessions.proxy + end + + # + # Sets the proxy information that the agent uses. + # + # @param [Hash] new_proxy + # The new proxy information. + # + # @return [Hash] + # The new proxy information. + # + # @see SessionCache#proxy= + # + # @since 0.2.2 + # + def proxy=(new_proxy) + @sessions.proxy = new_proxy + end + + # # Sets the history of URLs that were previously visited. # # @param [#each] new_history # A list of URLs to populate the history with. # @@ -398,14 +430,15 @@ # # @return [Boolean] # Specifies whether the URL was enqueued, or ignored. # def enqueue(url) - link = url.to_s - url = URI(link) unless url.kind_of?(URI) + url = sanitize_url(url) if (!(queued?(url)) && visit?(url)) + link = url.to_s + begin @every_url_blocks.each { |block| block.call(url) } @urls_like_blocks.each do |pattern,blocks| if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url) @@ -441,41 +474,55 @@ # # @return [Page, nil] # The page for the response, or +nil+ if the request failed. # def get_page(url,&block) - url = URI(url.to_s) unless url.kind_of?(URI) + url = URI(url.to_s) - host = url.host - port = url.port + prepare_request(url) do |session,path,headers| + new_page = Page.new(url,session.get(path,headers)) - unless url.path.empty? - path = url.path - else - path = '/' + # save any new cookies + @cookies.from_page(new_page) + + block.call(new_page) if block + return new_page end + end - # append the URL query to the path - path += "?#{url.query}" if url.query + # + # Posts supplied form data and creates a new Page object from a given URL. + # + # @param [URI::HTTP] url + # The URL to request. + # + # @param [String] post_data + # Form option data. + # + # @yield [page] + # If a block is given, it will be passed the page that represents the + # response. + # + # @yieldparam [Page] page + # The page for the response. + # + # @return [Page, nil] + # The page for the response, or +nil+ if the request failed. + # + # @since 0.2.2 + # + def post_page(url,post_data='',&block) + url = URI(url.to_s) - begin - sleep(@delay) if @delay > 0 + prepare_request(url) do |session,path,headers| + new_page = Page.new(url,session.post(path,post_data,headers)) - get_session(url.scheme,host,port) do |sess| - headers = {} - headers['User-Agent'] = @user_agent if @user_agent - headers['Referer'] = @referer if @referer + # save any new cookies + @cookies.from_page(new_page) - new_page = Page.new(url,sess.get(path,headers)) - - block.call(new_page) if block - return new_page - end - rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError - failed(url) - kill_session(url.scheme,host,port) - return nil + block.call(new_page) if block + return new_page end end # # Visits a given URL, and enqueus the links recovered from the URL @@ -527,77 +574,70 @@ end protected # - # Provides an active HTTP session for the given scheme, host - # and port. + # Normalizes the request path and grabs a session to handle page + # get and post requests. # - # @param [String] scheme - # The scheme of the URL, which will be requested later. + # @param [URI::HTTP] url + # The URL to request. # - # @param [String] host - # The host that the session is needed with. + # @yield [request] + # A block whose purpose is to make a page request. # - # @param [Integer] port - # The port that the session is needed for. + # @yieldparam [Net::HTTP] session + # An HTTP session object. # - # @yield [session] - # If a block is given, it will be passed the active HTTP session. + # @yieldparam [String] path + # Normalized URL string. # - # @yieldparam [Net::HTTP] session - # The active HTTP session object. + # @yieldparam [Hash] headers + # A Hash of request header options. # - def get_session(scheme,host,port,&block) - key = [scheme,host,port] + # @since 0.2.2 + # + def prepare_request(url,&block) + host = url.host + port = url.port - unless @sessions[key] - session = Net::HTTP::Proxy( - @proxy[:host], - @proxy[:port], - @proxy[:user], - @proxy[:password] - ).new(host,port) + unless url.path.empty? + path = url.path + else + path = '/' + end - if scheme == 'https' - session.use_ssl = true - session.verify_mode = OpenSSL::SSL::VERIFY_NONE + # append the URL query to the path + path += "?#{url.query}" if url.query + + begin + sleep(@delay) if @delay > 0 + + headers = {} + headers['User-Agent'] = @user_agent if @user_agent + headers['Referer'] = @referer if @referer + + if (authorization = @authorized.for_url(url)) + headers['Authorization'] = "Basic #{authorization}" end - @sessions[key] = session - end + if (header_cookies = @cookies.for_host(url.host)) + headers['Cookie'] = header_cookies + end - session = @sessions[key] - block.call(session) if block - return session - end + block.call(@sessions[url],path,headers) + rescue SystemCallError, + Timeout::Error, + SocketError, + Net::HTTPBadResponse, + IOError - # - # Destroys an HTTP session for the given scheme, host and port. - # - # @param [String] scheme - # The scheme of the URL, which was requested through the session. - # - # @param [String] host - # The host that the session was connected with. - # - # @param [Integer] port - # The port that the session was connected to. - # - def kill_session(scheme,host,port,&block) - key = [scheme,host,port] - sess = @sessions[key] + @sessions.kill!(url) - begin - sess.finish - rescue IOError - nil + failed(url) + return nil end - - @sessions.delete(key) - block.call if block - return nil end # # Dequeues a URL that will later be visited. # @@ -631,11 +671,11 @@ # # @param [URI::HTTP] url # The URL to add to the failures list. # def failed(url) - @every_failed_url_blocks.each { |block| block.call(url) } @failures << url + @every_failed_url_blocks.each { |block| block.call(url) } return true end end end