lib/spidr/agent.rb in spidr-0.2.1 vs lib/spidr/agent.rb in spidr-0.2.2
- old
+ new
@@ -1,27 +1,32 @@
+require 'spidr/sanitizers'
require 'spidr/filters'
require 'spidr/events'
require 'spidr/actions'
require 'spidr/page'
+require 'spidr/session_cache'
+require 'spidr/cookie_jar'
+require 'spidr/auth_store'
require 'spidr/spidr'
require 'net/http'
require 'set'
module Spidr
class Agent
+ include Sanitizers
include Filters
include Events
include Actions
- # Proxy to use
- attr_accessor :proxy
-
# User-Agent to use
attr_accessor :user_agent
+ # HTTP Authentication credentials
+ attr_accessor :authorized
+
# Referer to use
attr_accessor :referer
# Delay in between fetching pages
attr_accessor :delay
@@ -33,10 +38,13 @@
attr_reader :failures
# Queue of URLs to visit
attr_reader :queue
+ # Cached cookies
+ attr_reader :cookies
+
#
# Creates a new Agent object.
#
# @param [Hash] options
# Additional options
@@ -77,22 +85,23 @@
#
# @yieldparam [Agent] agent
# The newly created agent.
#
def initialize(options={},&block)
- @proxy = (options[:proxy] || Spidr.proxy)
# an explicit :user_agent option wins over the library-wide Spidr.user_agent
@user_agent = (options[:user_agent] || Spidr.user_agent)
@referer = options[:referer]
# the proxy settings (:proxy option, else Spidr.proxy) are handed to the
# session cache instead of being stored on the agent itself
+ @sessions = SessionCache.new(options[:proxy] || Spidr.proxy)
+ @cookies = CookieJar.new
+ @authorized = AuthStore.new
+
@running = false
# no pause between requests unless a :delay option is given
@delay = (options[:delay] || 0)
@history = Set[]
@failures = Set[]
@queue = []
- @sessions = {}
-
# NOTE(review): presumably lets the included modules (Sanitizers, Filters,
# Events, Actions) pick up their own options -- confirm
super(options)
# hand the configured agent to the caller's block, if one was given
block.call(self) if block
end
@@ -220,18 +229,10 @@
end
end
@running = false
- @sessions.each_value do |sess|
- begin
- sess.finish
- rescue IOError
- nil
- end
- end
-
@sessions.clear
return self
end
#
@@ -243,10 +244,41 @@
def running?
# true only while @running is exactly +true+; any other value yields false
@running == true
end
#
+ # The proxy information the agent uses, read from its session cache.
+ #
+ # @return [Hash]
+ # The proxy information.
+ #
+ # @see SessionCache#proxy
+ #
+ # @since 0.2.2
+ #
+ def proxy
+ @sessions.proxy
+ end
+
+ #
+ # Sets the proxy information that the agent uses by assigning it to the
+ # agent's session cache.
+ #
+ # @param [Hash] new_proxy
+ # The new proxy information.
+ #
+ # @return [Hash]
+ # The new proxy information.
+ #
+ # @see SessionCache#proxy=
+ #
+ # @since 0.2.2
+ #
+ def proxy=(new_proxy)
+ @sessions.proxy = new_proxy
+ end
+
+ #
# Sets the history of URLs that were previously visited.
#
# @param [#each] new_history
# A list of URLs to populate the history with.
#
@@ -398,14 +430,15 @@
#
# @return [Boolean]
# Specifies whether the URL was enqueued, or ignored.
#
def enqueue(url)
- link = url.to_s
- url = URI(link) unless url.kind_of?(URI)
+ url = sanitize_url(url)
if (!(queued?(url)) && visit?(url))
+ link = url.to_s
+
begin
@every_url_blocks.each { |block| block.call(url) }
@urls_like_blocks.each do |pattern,blocks|
if ((pattern.kind_of?(Regexp) && link =~ pattern) || pattern == link || pattern == url)
@@ -441,41 +474,55 @@
#
# @return [Page, nil]
# The page for the response, or +nil+ if the request failed.
#
def get_page(url,&block)
- url = URI(url.to_s) unless url.kind_of?(URI)
+ url = URI(url.to_s)
- host = url.host
- port = url.port
# prepare_request yields the session, request path and headers; if the
# request raises one of the errors it rescues, it records the failure and
# returns +nil+, which then becomes the result of get_page
+ prepare_request(url) do |session,path,headers|
+ new_page = Page.new(url,session.get(path,headers))
- unless url.path.empty?
- path = url.path
- else
- path = '/'
+ # save any new cookies from the response into the agent's cookie jar
+ @cookies.from_page(new_page)
+
+ block.call(new_page) if block
# +return+ inside the block exits get_page itself with the new page
+ return new_page
end
+ end
- # append the URL query to the path
- path += "?#{url.query}" if url.query
+ #
+ # Posts the supplied form data to a given URL and creates a new Page
+ # object from the response.
+ #
+ # @param [URI::HTTP] url
+ # The URL to request.
+ #
+ # @param [String] post_data
+ # The form data to send with the POST request (empty by default).
+ #
+ # @yield [page]
+ # If a block is given, it will be passed the page that represents the
+ # response.
+ #
+ # @yieldparam [Page] page
+ # The page for the response.
+ #
+ # @return [Page, nil]
+ # The page for the response, or +nil+ if the request failed.
+ #
+ # @since 0.2.2
+ #
+ def post_page(url,post_data='',&block)
+ url = URI(url.to_s)
- begin
- sleep(@delay) if @delay > 0
# prepare_request yields the session, request path and headers; if the
# request raises one of the errors it rescues, it records the failure and
# returns +nil+, which then becomes the result of post_page
+ prepare_request(url) do |session,path,headers|
+ new_page = Page.new(url,session.post(path,post_data,headers))
- get_session(url.scheme,host,port) do |sess|
- headers = {}
- headers['User-Agent'] = @user_agent if @user_agent
- headers['Referer'] = @referer if @referer
+ # save any new cookies from the response into the agent's cookie jar
+ @cookies.from_page(new_page)
- new_page = Page.new(url,sess.get(path,headers))
-
- block.call(new_page) if block
- return new_page
- end
- rescue SystemCallError, Timeout::Error, Net::HTTPBadResponse, IOError
- failed(url)
- kill_session(url.scheme,host,port)
- return nil
+ block.call(new_page) if block
# +return+ inside the block exits post_page itself with the new page
+ return new_page
end
end
#
# Visits a given URL, and enqueues the links recovered from the URL
@@ -527,77 +574,70 @@
end
protected
#
- # Provides an active HTTP session for the given scheme, host
- # and port.
+ # Normalizes the request path and grabs a session to handle page
+ # get and post requests.
#
- # @param [String] scheme
- # The scheme of the URL, which will be requested later.
+ # @param [URI::HTTP] url
+ # The URL to request.
#
- # @param [String] host
- # The host that the session is needed with.
+ # @yield [session, path, headers]
+ # A block that performs the actual page request.
#
- # @param [Integer] port
- # The port that the session is needed for.
+ # @yieldparam [Net::HTTP] session
+ # An HTTP session object.
#
- # @yield [session]
- # If a block is given, it will be passed the active HTTP session.
+ # @yieldparam [String] path
+ # The request path: the URL's path ("/" when empty) plus "?query" when the URL has a query.
#
- # @yieldparam [Net::HTTP] session
- # The active HTTP session object.
+ # @yieldparam [Hash] headers
+ # The request headers (User-Agent, Referer, Authorization and Cookie, each only when available).
#
- def get_session(scheme,host,port,&block)
- key = [scheme,host,port]
+ # @since 0.2.2
+ #
+ def prepare_request(url,&block)
# NOTE(review): host and port are not referenced again by the added (+)
# lines of this method; url.host is used directly further down
+ host = url.host
+ port = url.port
- unless @sessions[key]
- session = Net::HTTP::Proxy(
- @proxy[:host],
- @proxy[:port],
- @proxy[:user],
- @proxy[:password]
- ).new(host,port)
+ unless url.path.empty?
+ path = url.path
+ else
+ path = '/'
+ end
- if scheme == 'https'
- session.use_ssl = true
- session.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ # append the URL query to the path
+ path += "?#{url.query}" if url.query
+
+ begin
+ sleep(@delay) if @delay > 0
+
+ headers = {}
+ headers['User-Agent'] = @user_agent if @user_agent
+ headers['Referer'] = @referer if @referer
+
+ if (authorization = @authorized.for_url(url))
+ headers['Authorization'] = "Basic #{authorization}"
end
- @sessions[key] = session
- end
+ if (header_cookies = @cookies.for_host(url.host))
+ headers['Cookie'] = header_cookies
+ end
- session = @sessions[key]
- block.call(session) if block
- return session
- end
+ block.call(@sessions[url],path,headers)
+ rescue SystemCallError,
+ Timeout::Error,
+ SocketError,
+ Net::HTTPBadResponse,
+ IOError
- #
- # Destroys an HTTP session for the given scheme, host and port.
- #
- # @param [String] scheme
- # The scheme of the URL, which was requested through the session.
- #
- # @param [String] host
- # The host that the session was connected with.
- #
- # @param [Integer] port
- # The port that the session was connected to.
- #
- def kill_session(scheme,host,port,&block)
- key = [scheme,host,port]
- sess = @sessions[key]
# on a rescued error the URL is recorded as failed and +nil+ is returned;
# presumably kill! closes and forgets the cached session, as the removed
# kill_session did -- confirm against SessionCache#kill!
+ @sessions.kill!(url)
- begin
- sess.finish
- rescue IOError
- nil
+ failed(url)
+ return nil
end
-
- @sessions.delete(key)
- block.call if block
- return nil
end
#
# Dequeues a URL that will later be visited.
#
@@ -631,11 +671,11 @@
#
# @param [URI::HTTP] url
# The URL to add to the failures list.
#
def failed(url)
- @every_failed_url_blocks.each { |block| block.call(url) }
# the URL is now added to @failures before the failure callbacks run, so it
# is already listed when a callback executes (or raises)
@failures << url
+ @every_failed_url_blocks.each { |block| block.call(url) }
return true
end
end
end