lib/spidr/agent.rb in spidr-0.2.2 vs lib/spidr/agent.rb in spidr-0.2.3

- old
+ new

@@ -17,10 +17,16 @@ include Sanitizers include Filters include Events include Actions + # HTTP Host Header to use + attr_accessor :host_header + + # HTTP Host Headers to use for specific hosts + attr_reader :host_headers + # User-Agent to use attr_accessor :user_agent # HTTP Authentication credentials attr_accessor :authorized @@ -62,10 +68,16 @@ # The user to authenticate as with the proxy. # # @option :proxy [String] :password # The password to authenticate with. # + # @option options [String] :host_header + # The HTTP Host header to use with each request. + # + # @option options [Hash{String,Regexp => String}] :host_headers + # The HTTP Host headers to use for specific hosts. + # # @option options [String] :user_agent (Spidr.user_agent) # The User-Agent string to send with each requests. # # @option options [String] :referer # The Referer URL to send with each request. @@ -85,10 +97,17 @@ # # @yieldparam [Agent] agent # The newly created agent. # def initialize(options={},&block) + @host_header = options[:host_header] + @host_headers = {} + + if options[:host_headers] + @host_headers.merge!(options[:host_headers]) + end + @user_agent = (options[:user_agent] || Spidr.user_agent) @referer = options[:referer] @sessions = SessionCache.new(options[:proxy] || Spidr.proxy) @cookies = CookieJar.new @@ -471,11 +490,11 @@ # # @yieldparam [Page] page # The page for the response. # # @return [Page, nil] - # The page for the response, or +nil+ if the request failed. + # The page for the response, or `nil` if the request failed. # def get_page(url,&block) url = URI(url.to_s) prepare_request(url) do |session,path,headers| @@ -504,11 +523,11 @@ # # @yieldparam [Page] page # The page for the response. # # @return [Page, nil] - # The page for the response, or +nil+ if the request failed. + # The page for the response, or `nil` if the request failed. # # @since 0.2.2 # def post_page(url,post_data='',&block) url = URI(url.to_s) @@ -536,11 +555,11 @@ # # @yieldparam [Page] page # The page which was visited. # # @return [Page, nil] - # The page that was visited. If +nil+ is returned, either the request + # The page that was visited. If `nil` is returned, either the request # for the page failed, or the page was skipped. # def visit_page(url,&block) url = URI(url.to_s) unless url.kind_of?(URI) @@ -556,20 +575,33 @@ rescue Actions::SkipPage return nil rescue Actions::Action end - page.urls.each { |next_url| enqueue(next_url) } + page.urls.each do |next_url| + begin + @every_link_blocks.each do |link_block| + link_block.call(page.url,next_url) + end + rescue Actions::Paused => action + raise(action) + rescue Actions::SkipLink + next + rescue Actions::Action + end + + enqueue(next_url) + end end end # # Converts the agent into a Hash. # # @return [Hash] - # The agent represented as a Hash containing the +history+ and - # the +queue+ of the agent. + # The agent represented as a Hash containing the `history` and + # the `queue` of the agent. # def to_hash {:history => @history, :queue => @queue} end @@ -607,23 +639,35 @@ end # append the URL query to the path path += "?#{url.query}" if url.query - begin - sleep(@delay) if @delay > 0 + # set any additional HTTP headers + headers = {} - headers = {} - headers['User-Agent'] = @user_agent if @user_agent - headers['Referer'] = @referer if @referer - - if (authorization = @authorized.for_url(url)) - headers['Authorization'] = "Basic #{authorization}" + unless @host_headers.empty? + @host_headers.each do |name,header| + if host.match(name) + headers['Host'] = header + break + end end + end - if (header_cookies = @cookies.for_host(url.host)) - headers['Cookie'] = header_cookies - end + headers['Host'] ||= @host_header if @host_header + headers['User-Agent'] = @user_agent if @user_agent + headers['Referer'] = @referer if @referer + + if (authorization = @authorized.for_url(url)) + headers['Authorization'] = "Basic #{authorization}" + end + + if (header_cookies = @cookies.for_host(url.host)) + headers['Cookie'] = header_cookies + end + + begin + sleep(@delay) if @delay > 0 block.call(@sessions[url],path,headers) rescue SystemCallError, Timeout::Error, SocketError,