lib/spidr/agent.rb in spidr-0.2.2 vs lib/spidr/agent.rb in spidr-0.2.3
- old
+ new
@@ -17,10 +17,16 @@
include Sanitizers
include Filters
include Events
include Actions
+ # HTTP Host Header to use
+ attr_accessor :host_header
+
+ # HTTP Host Headers to use for specific hosts
+ attr_reader :host_headers
+
# User-Agent to use
attr_accessor :user_agent
# HTTP Authentication credentials
attr_accessor :authorized
@@ -62,10 +68,16 @@
# The user to authenticate as with the proxy.
#
# @option :proxy [String] :password
# The password to authenticate with.
#
+ # @option options [String] :host_header
+ # The HTTP Host header to use with each request.
+ #
+ # @option options [Hash{String,Regexp => String}] :host_headers
+ # The HTTP Host headers to use for specific hosts.
+ #
# @option options [String] :user_agent (Spidr.user_agent)
# The User-Agent string to send with each request.
#
# @option options [String] :referer
# The Referer URL to send with each request.
@@ -85,10 +97,17 @@
#
# @yieldparam [Agent] agent
# The newly created agent.
#
def initialize(options={},&block)
+ @host_header = options[:host_header]
+ @host_headers = {}
+
+ if options[:host_headers]
+ @host_headers.merge!(options[:host_headers])
+ end
+
@user_agent = (options[:user_agent] || Spidr.user_agent)
@referer = options[:referer]
@sessions = SessionCache.new(options[:proxy] || Spidr.proxy)
@cookies = CookieJar.new
@@ -471,11 +490,11 @@
#
# @yieldparam [Page] page
# The page for the response.
#
# @return [Page, nil]
- # The page for the response, or +nil+ if the request failed.
+ # The page for the response, or `nil` if the request failed.
#
def get_page(url,&block)
url = URI(url.to_s)
prepare_request(url) do |session,path,headers|
@@ -504,11 +523,11 @@
#
# @yieldparam [Page] page
# The page for the response.
#
# @return [Page, nil]
- # The page for the response, or +nil+ if the request failed.
+ # The page for the response, or `nil` if the request failed.
#
# @since 0.2.2
#
def post_page(url,post_data='',&block)
url = URI(url.to_s)
@@ -536,11 +555,11 @@
#
# @yieldparam [Page] page
# The page which was visited.
#
# @return [Page, nil]
- # The page that was visited. If +nil+ is returned, either the request
+ # The page that was visited. If `nil` is returned, either the request
# for the page failed, or the page was skipped.
#
def visit_page(url,&block)
url = URI(url.to_s) unless url.kind_of?(URI)
@@ -556,20 +575,33 @@
rescue Actions::SkipPage
return nil
rescue Actions::Action
end
- page.urls.each { |next_url| enqueue(next_url) }
+ page.urls.each do |next_url|
+ begin
+ @every_link_blocks.each do |link_block|
+ link_block.call(page.url,next_url)
+ end
+ rescue Actions::Paused => action
+ raise(action)
+ rescue Actions::SkipLink
+ next
+ rescue Actions::Action
+ end
+
+ enqueue(next_url)
+ end
end
end
#
# Converts the agent into a Hash.
#
# @return [Hash]
- # The agent represented as a Hash containing the +history+ and
- # the +queue+ of the agent.
+ # The agent represented as a Hash containing the `history` and
+ # the `queue` of the agent.
#
def to_hash
  # Build the Hash key by key; :history is inserted before :queue so
  # the resulting key order is the same as a two-entry literal.
  snapshot = {}
  snapshot[:history] = @history
  snapshot[:queue] = @queue
  snapshot
end
@@ -607,23 +639,35 @@
end
# append the URL query to the path
path += "?#{url.query}" if url.query
- begin
- sleep(@delay) if @delay > 0
+ # set any additional HTTP headers
+ headers = {}
- headers = {}
- headers['User-Agent'] = @user_agent if @user_agent
- headers['Referer'] = @referer if @referer
-
- if (authorization = @authorized.for_url(url))
- headers['Authorization'] = "Basic #{authorization}"
+ unless @host_headers.empty?
+ @host_headers.each do |name,header|
+ if host.match(name)
+ headers['Host'] = header
+ break
+ end
end
+ end
- if (header_cookies = @cookies.for_host(url.host))
- headers['Cookie'] = header_cookies
- end
+ headers['Host'] ||= @host_header if @host_header
+ headers['User-Agent'] = @user_agent if @user_agent
+ headers['Referer'] = @referer if @referer
+
+ if (authorization = @authorized.for_url(url))
+ headers['Authorization'] = "Basic #{authorization}"
+ end
+
+ if (header_cookies = @cookies.for_host(url.host))
+ headers['Cookie'] = header_cookies
+ end
+
+ begin
+ sleep(@delay) if @delay > 0
block.call(@sessions[url],path,headers)
rescue SystemCallError,
Timeout::Error,
SocketError,