lib/mechanize.rb in mechanize-2.0.pre.2 vs lib/mechanize.rb in mechanize-2.0

- old
+ new

@@ -41,10 +41,11 @@
   ruby_version = if RUBY_PATCHLEVEL >= 0 then
                    "#{RUBY_VERSION}p#{RUBY_PATCHLEVEL}"
                  else
                    "#{RUBY_VERSION}dev#{RUBY_REVISION}"
                  end

+  ##
   # User Agent aliases

   AGENT_ALIASES = {
     'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
@@ -59,104 +60,214 @@
     'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3',
     'Mechanize' => "Mechanize/#{VERSION} Ruby/#{ruby_version} (http://github.com/tenderlove/mechanize/)"
   }

   # A Mechanize::CookieJar which stores cookies
-  attr_accessor :cookie_jar
+  def cookie_jar
+    @agent.cookie_jar
+  end
+
+  def cookie_jar= cookie_jar
+    @agent.cookie_jar = cookie_jar
+  end
+
   # Length of time to wait until a connection is opened in seconds
-  attr_accessor :open_timeout
+  def open_timeout
+    @agent.open_timeout
+  end
+
+  def open_timeout= open_timeout
+    @agent.open_timeout = open_timeout
+  end
+
   # Length of time to attempt to read data from the server
-  attr_accessor :read_timeout
+  def read_timeout
+    @agent.read_timeout
+  end
+
+  def read_timeout= read_timeout
+    @agent.read_timeout = read_timeout
+  end
+
   # The identification string for the client initiating a web request
-  attr_reader :user_agent
+  def user_agent
+    @agent.user_agent
+  end

   # The value of watch_for_set is passed to pluggable parsers for retrieved
   # content
   attr_accessor :watch_for_set

   # Path to an OpenSSL server certificate file
-  attr_accessor :ca_file
+  def ca_file
+    @agent.ca_file
+  end
+
+  def ca_file= ca_file
+    @agent.ca_file = ca_file
+  end
+
+  def certificate
+    @agent.certificate
+  end
+
   # An OpenSSL private key or the path to a private key
-  attr_accessor :key
+  def key
+    @agent.key
+  end
+
+  def key= key
+    @agent.key = key
+  end
+
   # An OpenSSL client certificate or the path to a certificate file.
-  attr_accessor :cert
+  def cert
+    @agent.cert
+  end
+
+  def cert= cert
+    @agent.cert = cert
+  end
+
   # OpenSSL key password
-  attr_accessor :pass
+  def pass
+    @agent.pass
+  end

-  # Controls how this agent deals with redirects. If it is set to
-  # true or :all, all 3xx redirects are automatically followed. This
-  # is the default behavior. If it is :permanent, only 301 (Moved
-  # Permanently) redirects are followed. If it is a false value, no
-  # redirects are followed.
-  attr_accessor :redirect_ok
+  def pass= pass
+    @agent.pass = pass
+  end

-  # Says this agent should consult the site's robots.txt for each access.
-  attr_reader :robots
+  # Controls how this agent deals with redirects. The following values are
+  # allowed:
+  #
+  # :all, true:: All 3xx redirects are followed (default)
+  # :permanent:: Only 301 Moved Permanantly redirects are followed
+  # false:: No redirects are followed

-  def robots=(value)
-    require 'webrobots' if value
-    @webrobots = nil if value != @robots
-    @robots = value
+  def redirect_ok
+    @agent.redirect_ok
   end

+  def redirect_ok= follow
+    @agent.redirect_ok = follow
+  end
+
+  def gzip_enabled
+    @agent.gzip_enabled
+  end
+
   # Disables HTTP/1.1 gzip compression (enabled by default)
-  attr_accessor :gzip_enabled
+  def gzip_enabled=enabled
+    @agent.gzip_enabled = enabled
+  end

   # HTTP/1.0 keep-alive time
-  attr_accessor :keep_alive_time
+  def keep_alive_time
+    @agent.keep_alive_time
+  end
+
+  def keep_alive_time= keep_alive_time
+    @agent.keep_alive_time = keep_alive_time
+  end

   # HTTP/1.1 keep-alives are always active. This does nothing.
   attr_accessor :keep_alive

+  def conditional_requests
+    @agent.conditional_requests
+  end
+
   # Disables If-Modified-Since conditional requests (enabled by default)
-  attr_accessor :conditional_requests
+  def conditional_requests= enabled
+    @agent.conditional_requests = enabled
+  end

-  # Follow HTML meta refresh
-  attr_accessor :follow_meta_refresh
+  # Follow HTML meta refresh. If set to +:anywhere+ meta refresh tags outside
+  # of the head element will be followed.
+  def follow_meta_refresh
+    @agent.follow_meta_refresh
+  end
+
+  def follow_meta_refresh= follow
+    @agent.follow_meta_refresh = follow
+  end

   # A callback for additional certificate verification. See
   # OpenSSL::SSL::SSLContext#verify_callback
-  attr_accessor :verify_callback
+  #
+  # The callback can be used for debugging or to ignore errors by always
+  # returning +true+. Specifying nil uses the default method that was valid
+  # when the SSLContext was created
+  def verify_callback
+    @agent.verify_callback
+  end
+
+  def verify_callback= verify_callback
+    @agent.verify_callback = verify_callback
+  end

   attr_accessor :history_added

-  attr_accessor :scheme_handlers
-  attr_accessor :redirection_limit
+  def redirection_limit
+    @agent.redirection_limit
+  end
+
+  def redirection_limit= limit
+    @agent.redirection_limit = limit
+  end
+
+  def scheme_handlers
+    @agent.scheme_handlers
+  end
+
+  def scheme_handlers= scheme_handlers
+    @agent.scheme_handlers = scheme_handlers
+  end

   # A hash of custom request headers
-  attr_accessor :request_headers
+  def request_headers
+    @agent.request_headers
+  end
+
+  def request_headers= request_headers
+    @agent.request_headers = request_headers
+  end

   # Proxy settings
   attr_reader :proxy_addr
   attr_reader :proxy_pass
   attr_reader :proxy_port
   attr_reader :proxy_user

   # The HTML parser to be used when parsing documents
   attr_accessor :html_parser

-  attr_reader :http # :nodoc:
+  attr_reader :agent # :nodoc:

-  attr_reader :history
+  def history
+    @agent.history
+  end

   attr_reader :pluggable_parser

   # A list of hooks to call after retrieving a response. Hooks are called with
   # the agent and the response returned.
-  attr_reader :post_connect_hooks
+  def post_connect_hooks
+    @agent.post_connect_hooks
+  end

   # A list of hooks to call before making a request. Hooks are called with
   # the agent and the request to be performed.
-  attr_reader :pre_connect_hooks
+  def pre_connect_hooks
+    @agent.pre_connect_hooks
+  end

-  alias :follow_redirect? :redirect_ok
+  alias follow_redirect? redirect_ok
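Every accessor above that was a plain attr_accessor in 2.0.pre.2 is now a hand-written reader/writer pair forwarding to the internal Mechanize::HTTP::Agent, so the Mechanize object and its agent always share one piece of state. A minimal sketch of that from calling code (the values are illustrative; `agent` is the :nodoc: reader added in this hunk):

    require 'mechanize'

    m = Mechanize.new

    m.redirect_ok = :permanent  # writer forwards to @agent.redirect_ok=
    m.agent.redirect_ok         # => :permanent, the same underlying state

    m.open_timeout = 5          # seconds to wait for a connection to open
    m.agent.open_timeout        # => 5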

   @html_parser = Nokogiri::HTML

   class << self
     attr_accessor :html_parser, :log
@@ -165,127 +276,82 @@
       child.log ||= log
       super
     end
   end

+  # A default encoding name used when parsing HTML parsing. When set it is
+  # used after any other encoding. The default is nil.
+
+  attr_accessor :default_encoding
+
+  # Overrides the encodings given by the HTTP server and the HTML page with
+  # the default_encoding when set to true.
+
+  attr_accessor :force_default_encoding
+
   def initialize
+    @agent = Mechanize::HTTP::Agent.new
+    @agent.context = self
+
     # attr_accessors
-    @cookie_jar = CookieJar.new
-    @log = nil
-    @open_timeout = nil
-    @read_timeout = nil
-    @user_agent = AGENT_ALIASES['Mechanize']
-    @watch_for_set = nil
-    @history_added = nil
-    @ca_file = nil # OpenSSL server certificate file
+    @agent.user_agent = AGENT_ALIASES['Mechanize']
+    @watch_for_set = nil
+    @history_added = nil

-    # callback for OpenSSL errors while verifying the server certificate
-    # chain, can be used for debugging or to ignore errors by always
-    # returning _true_
-    # specifying nil uses the default method that was valid when the SSL was created
-    @verify_callback = nil
-    @cert = nil # OpenSSL Certificate
-    @key = nil # OpenSSL Private Key
-    @pass = nil # OpenSSL Password
-    @redirect_ok = true
-    @gzip_enabled = true
-
     # attr_readers
-    @history = Mechanize::History.new
     @pluggable_parser = PluggableParser.new

-    # Auth variables
-    @user = nil # Auth User
-    @password = nil # Auth Password
-    @digest = nil # DigestAuth Digest
-    @digest_auth = Net::HTTP::DigestAuth.new
-    @auth_hash = {} # Keep track of urls for sending auth
-    @request_headers= {} # A hash of request headers to be used
-
-    @conditional_requests = true
-
-    @follow_meta_refresh = false
-    @redirection_limit = 20
-
-    @robots = false
-    @webrobots = nil
-
-    # Connection Cache & Keep alive
-    @keep_alive_time = 300
     @keep_alive = true

     # Proxy
     @proxy_addr = nil
     @proxy_port = nil
     @proxy_user = nil
     @proxy_pass = nil

-    @scheme_handlers = Hash.new { |h, scheme|
-      h[scheme] = lambda { |link, page|
-        raise Mechanize::UnsupportedSchemeError, scheme
-      }
-    }
+    @html_parser = self.class.html_parser

-    @scheme_handlers['http'] = lambda { |link, page| link }
-    @scheme_handlers['https'] = @scheme_handlers['http']
-    @scheme_handlers['relative'] = @scheme_handlers['http']
-    @scheme_handlers['file'] = @scheme_handlers['http']
+    @default_encoding = nil
+    @force_default_encoding = false

-    @pre_connect_hooks = []
-    @post_connect_hooks = []
+    yield self if block_given?

-    @html_parser = self.class.html_parser
+    @agent.set_proxy @proxy_addr, @proxy_port, @proxy_user, @proxy_pass
+    @agent.set_http
+  end

-    yield self if block_given?
+  def max_history
+    @agent.history.max_size
+  end

-    if @proxy_addr and @proxy_pass then
-      set_proxy @proxy_addr, @proxy_port, @proxy_user, @proxy_pass
-    else
-      set_http
-    end
+  def max_history= length
+    @agent.history.max_size = length
   end

-  def max_history=(length); @history.max_size = length end
-  def max_history; @history.max_size end
   def log=(l); self.class.log = l end
   def log; self.class.log end

-  # Sets the proxy address, port, user, and password
-  # +addr+ should be a host, with no "http://"
-  def set_proxy(addr, port, user = nil, pass = nil)
-    proxy = URI.parse "http://#{addr}"
-    proxy.port = port
-    proxy.user = user if user
-    proxy.password = pass if pass
-
-    set_http proxy
-
-    nil
+  def user_agent= user_agent
+    @agent.user_agent = user_agent
   end

-  def user_agent=(value)
-    @webrobots = nil if value != @user_agent
-    @user_agent = value
-  end
-
-  # Set the user agent for the Mechanize object.
-  # See AGENT_ALIASES
+  # Set the user agent for the Mechanize object. See AGENT_ALIASES
   def user_agent_alias=(al)
-    @user_agent = AGENT_ALIASES[al] ||
-      raise(ArgumentError, "unknown agent alias")
+    self.user_agent = AGENT_ALIASES[al] ||
+      raise(ArgumentError, "unknown agent alias #{al.inspect}")
   end
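initialize now builds the Mechanize::HTTP::Agent first, still yields self when a block is given, and only afterwards calls @agent.set_proxy and @agent.set_http, so anything set in a constructor block reaches the agent before the first request. A hedged sketch (the alias and values are illustrative):

    agent = Mechanize.new do |a|
      a.user_agent_alias    = 'Windows IE 6'  # any key of AGENT_ALIASES
      a.follow_meta_refresh = true
      a.max_history         = 10              # caps @agent.history.max_size
    end

Also note that user_agent_alias= now includes the offending value in its ArgumentError message instead of the bare "unknown agent alias".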

   # Returns a list of cookies stored in the cookie jar.
   def cookies
-    @cookie_jar.to_a
+    @agent.cookie_jar.to_a
   end

   # Sets the user and password to be used for authentication.
   def auth(user, password)
-    @user = user
-    @password = password
+    @agent.user = user
+    @agent.password = password
   end
+  alias :basic_auth :auth

   # Fetches the URL passed in and returns a page.
   def get(uri, parameters = [], referer = nil, headers = {})
     method = :get
@@ -300,17 +366,16 @@
       referer = options[:referer]
       headers = options[:headers]
       method = options[:verb] || method
     end

-    unless referer
+    referer ||=
       if uri.to_s =~ %r{\Ahttps?://}
-        referer = Page.new(nil, {'content-type'=>'text/html'})
+        Page.new(nil, {'content-type'=>'text/html'})
       else
-        referer = current_page || Page.new(nil, {'content-type'=>'text/html'})
+        current_page || Page.new(nil, {'content-type'=>'text/html'})
       end
-    end

     # FIXME: Huge hack so that using a URI as a referer works. I need to
     # refactor everything to pass around URIs but still support
     # Mechanize::Page#base
     unless referer.is_a?(Mechanize::File)
@@ -319,11 +384,11 @@
       Page.new(referer, {'content-type' => 'text/html'})
     end

     # fetch the page
     headers ||= {}
-    page = fetch_page uri, method, headers, parameters, referer
+    page = @agent.fetch uri, method, headers, parameters, referer
     add_to_history(page)
     yield page if block_given?
     page
   end
@@ -340,11 +405,11 @@
   # DELETE to +url+ with +query_params+, and setting +headers+:
   #
   #   delete('http://example/', {'q' => 'foo'}, {})
   #
   def delete(uri, query_params = {}, headers = {})
-    page = fetch_page(uri, :delete, headers, query_params)
+    page = @agent.fetch(uri, :delete, headers, query_params)
     add_to_history(page)
     page
   end

   ##
@@ -352,11 +417,11 @@
   #
   #   head('http://example/', {'q' => 'foo'}, {})
   #
   def head(uri, query_params = {}, headers = {})
     # fetch the page
-    page = fetch_page(uri, :head, headers, query_params)
+    page = @agent.fetch(uri, :head, headers, query_params)
     yield page if block_given?
     page
   end

   # Fetch a file and return the contents of the file.
@@ -369,16 +434,22 @@
   # Mechanize::Page::Link object passed in. Returns the page fetched.
   def click(link)
     case link
     when Page::Link
       referer = link.page || current_page()
-      if robots
+      if @agent.robots
         if (referer.is_a?(Page) && referer.parser.nofollow?) ||
            link.rel?('nofollow')
           raise RobotsDisallowedError.new(link.href)
         end
       end
-      get link.href, [], referer
+      if link.rel?('noreferrer')
+        href = @agent.resolve(link.href, link.page || current_page)
+        referer = Page.new(nil, {'content-type'=>'text/html'})
+      else
+        href = link.href
+      end
+      get href, [], referer
     when String, Regexp
       if real_link = page.link_with(:text => link)
         click real_link
       else
         button = nil
@@ -397,11 +468,11 @@
     end
   end

   # Equivalent to the browser back button. Returns the most recent page
   # visited.
   def back
-    @history.pop
+    @agent.history.pop
   end

   # Posts to the given URL with the request entity. The request
   # entity is specified by either a string, or a list of key-value
   # pairs represented by a hash or an array of arrays.
@@ -466,515 +537,72 @@
     headers = {
       'Content-Type' => 'application/octet-stream',
       'Content-Length' => entity.size.to_s,
     }.update headers

-    page = fetch_page uri, verb, headers, [entity], cur_page
+    page = @agent.fetch uri, verb, headers, [entity], cur_page
     add_to_history(page)
     page
   end

   # Returns the current page loaded by Mechanize
   def current_page
-    @history.last
+    @agent.current_page
   end
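History bookkeeping follows the same delegation pattern: get, delete and the entity verbs push the fetched page through add_to_history, while head (above) does not touch history at all, before or after this refactoring. Illustrative usage, assuming a reachable URL:

    agent = Mechanize.new

    page = agent.get 'http://example.com/'  # fetched via @agent.fetch, pushed onto history
    agent.head 'http://example.com/'        # no add_to_history call; history is unchanged
    agent.current_page                      # => page, now via @agent.current_page
    agent.back                              # pops @agent.history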

-  # Returns whether or not a url has been visited
-  def visited?(url)
-    ! visited_page(url).nil?
-  end
-
   # Returns a visited page for the url passed in, otherwise nil
   def visited_page(url)
-    if url.respond_to? :href
-      url = url.href
-    end
-    @history.visited_page(resolve(url))
-  end
+    url = url.href if url.respond_to? :href

-  # Runs given block, then resets the page history as it was before. self is
-  # given as a parameter to the block. Returns the value of the block.
-  def transact
-    history_backup = @history.dup
-    begin
-      yield self
-    ensure
-      @history = history_backup
-    end
+    @agent.visited_page url
   end

-  # Tests if this agent is allowed to access +url+, consulting the
-  # site's robots.txt.
-  def robots_allowed?(uri)
-    return true if uri.request_uri == '/robots.txt'
+  # Returns whether or not a url has been visited
+  alias visited? visited_page

-    webrobots.allowed?(uri)
-  end
-
-  # Equivalent to !robots_allowed?(url).
-  def robots_disallowed?(url)
-    !webrobots.allowed?(url)
-  end
-
-  # Returns an error object if there is an error in fetching or
-  # parsing robots.txt of the site +url+.
-  def robots_error(url)
-    webrobots.error(url)
-  end
-
-  # Raises the error if there is an error in fetching or parsing
-  # robots.txt of the site +url+.
-  def robots_error!(url)
-    webrobots.error!(url)
-  end
-
-  # Removes robots.txt cache for the site +url+.
-  def robots_reset(url)
-    webrobots.reset(url)
-  end
-
-  alias :page :current_page
-
-  def connection_for uri
-    case uri.scheme.downcase
-    when 'http', 'https' then
-      return @http
-    when 'file' then
-      return Mechanize::FileConnection.new
-    end
-  end
-
-  def enable_gzip request
-    request['accept-encoding'] = if @gzip_enabled
-                                   'gzip,deflate,identity'
-                                 else
-                                   'identity'
-                                 end
-  end
-
-  def http_request uri, method, params = nil
-    case uri.scheme.downcase
-    when 'http', 'https' then
-      klass = Net::HTTP.const_get(method.to_s.capitalize)
-
-      request ||= klass.new(uri.request_uri)
-      request.body = params.first if params
-
-      request
-    when 'file' then
-      Mechanize::FileRequest.new uri
-    end
-  end
-
-  ##
-  # Invokes hooks added to post_connect_hooks after a +response+ is returned.
-  # Yields the +agent+ and the +response+ returned to each hook.
-
-  def post_connect response # :yields: agent, response
-    @post_connect_hooks.each do |hook|
-      hook.call self, response
-    end
-  end
-
-  ##
-  # Invokes hooks added to pre_connect_hooks before a +request+ is made.
-  # Yields the +agent+ and the +request+ that will be performed to each hook.
-
-  def pre_connect request # :yields: agent, request
-    @pre_connect_hooks.each do |hook|
-      hook.call self, request
-    end
-  end
-
-  def request_auth request, uri
-    auth_type = @auth_hash[uri.host]
-
-    return unless auth_type
-
-    case auth_type
-    when :basic
-      request.basic_auth @user, @password
-    when :digest, :iis_digest
-      uri.user = @user
-      uri.password = @password
-
-      iis = auth_type == :iis_digest
-
-      auth = @digest_auth.auth_header uri, @digest, request.method, iis
-
-      request['Authorization'] = auth
-    end
-  end
-
-  def request_cookies request, uri
-    return if @cookie_jar.empty? uri
-
-    cookies = @cookie_jar.cookies uri
-
-    return if cookies.empty?
-
-    request.add_field 'Cookie', cookies.join('; ')
-  end
-
-  def request_host request, uri
-    port = [80, 443].include?(uri.port.to_i) ? nil : uri.port
-    host = uri.host
-
-    request['Host'] = [host, port].compact.join ':'
-  end
-
-  def request_language_charset request
-    request['accept-charset'] = 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
-    request['accept-language'] = 'en-us,en;q=0.5'
-  end
-
-  # Log specified headers for the request
-  def request_log request
-    return unless log
-
-    log.info("#{request.class}: #{request.path}")
-
-    request.each_header do |k, v|
-      log.debug("request-header: #{k} => #{v}")
-    end
-  end
-
-  def request_add_headers request, headers = {}
-    @request_headers.each do |k,v|
-      request[k] = v
-    end
-
-    headers.each do |field, value|
-      case field
-      when :etag then request["ETag"] = value
-      when :if_modified_since then request["If-Modified-Since"] = value
-      when Symbol then
-        raise ArgumentError, "unknown header symbol #{field}"
-      else
-        request[field] = value
-      end
-    end
-  end
-
-  def request_referer request, uri, referer
-    return unless referer
-    return if 'https' == referer.scheme.downcase and
-              'https' != uri.scheme.downcase
-
-    request['Referer'] = referer
-  end
-
-  def request_user_agent request
-    request['User-Agent'] = @user_agent if @user_agent
-  end
-
-  def resolve(uri, referer = current_page())
-    uri = uri.dup if uri.is_a?(URI)
-
-    unless uri.is_a?(URI)
-      uri = uri.to_s.strip.gsub(/[^#{0.chr}-#{126.chr}]/o) { |match|
-        if RUBY_VERSION >= "1.9.0"
-          Mechanize::Util.uri_escape(match)
-        else
-          sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'C')[0])
-        end
-      }
-
-      unescaped = uri.split(/(?:%[0-9A-Fa-f]{2})+|#/)
-      escaped = uri.scan(/(?:%[0-9A-Fa-f]{2})+|#/)
-
-      escaped_uri = Mechanize::Util.html_unescape(
-        unescaped.zip(escaped).map { |x,y|
-          "#{WEBrick::HTTPUtils.escape(x)}#{y}"
-        }.join('')
-      )
-
-      begin
-        uri = URI.parse(escaped_uri)
-      rescue
-        uri = URI.parse(WEBrick::HTTPUtils.escape(escaped_uri))
-      end
-    end
-
-    scheme = uri.relative? ? 'relative' : uri.scheme.downcase
-    uri = @scheme_handlers[scheme].call(uri, referer)
-
-    if referer && referer.uri
-      if uri.path.length == 0 && uri.relative?
-        uri.path = referer.uri.path
-      end
-    end
-
-    uri.path = '/' if uri.path.length == 0
-
-    if uri.relative?
-      raise ArgumentError, "absolute URL needed (not #{uri})" unless
-        referer && referer.uri
-
-      base = nil
-      if referer.respond_to?(:bases) && referer.parser
-        base = referer.bases.last
-      end
-
-      uri = ((base && base.uri && base.uri.absolute?) ?
-             base.uri :
-             referer.uri) + uri
-      uri = referer.uri + uri
-      # Strip initial "/.." bits from the path
-      uri.path.sub!(/^(\/\.\.)+(?=\/)/, '')
-    end
-
-    unless ['http', 'https', 'file'].include?(uri.scheme.downcase)
-      raise ArgumentError, "unsupported scheme: #{uri.scheme}"
-    end
-
-    uri
-  end
-
-  def resolve_parameters uri, method, parameters
-    case method
-    when :head, :get, :delete, :trace then
-      if parameters and parameters.length > 0
-        uri.query ||= ''
-        uri.query << '&' if uri.query.length > 0
-        uri.query << Mechanize::Util.build_query_string(parameters)
-      end
-
-      return uri, nil
-    end
-
-    return uri, parameters
-  end
-
-  def response_cookies response, uri, page
-    if Mechanize::Page === page and page.body =~ /Set-Cookie/n
-      page.search('//head/meta[@http-equiv="Set-Cookie"]').each do |meta|
-        Mechanize::Cookie.parse(uri, meta['content']) { |c|
-          log.debug("saved cookie: #{c}") if log
-          @cookie_jar.add(uri, c)
-        }
-      end
-    end
-
-    header_cookies = response.get_fields 'Set-Cookie'
-
-    return unless header_cookies
-
-    header_cookies.each do |cookie|
-      Mechanize::Cookie.parse(uri, cookie) { |c|
-        log.debug("saved cookie: #{c}") if log
-        @cookie_jar.add(uri, c)
-      }
-    end
-  end
-
-  def response_follow_meta_refresh response, uri, page, redirects
-    return unless @follow_meta_refresh
-
-    redirect_uri = nil
-    referer = page
-
-    if page.respond_to?(:meta) and (redirect = page.meta.first)
-      redirect_uri = Mechanize::Util.uri_unescape redirect.uri.to_s
-      sleep redirect.node['delay'].to_f
-      referer = Page.new(nil, {'content-type'=>'text/html'})
-    elsif refresh = response['refresh']
-      delay, redirect_uri = Page::Meta.parse(refresh, uri)
-      raise Mechanize::Error, 'Invalid refresh http header' unless delay
-      raise RedirectLimitReachedError.new(page, redirects) if
-        redirects + 1 > redirection_limit
-      sleep delay.to_f
-    end
-
-    if redirect_uri
-      @history.push(page, page.uri)
-      fetch_page(redirect_uri, :get, {}, [], referer, redirects + 1)
-    end
-  end
-
-  def response_log response
-    return unless log
-
-    log.info("status: #{response.class} #{response.http_version} " \
-             "#{response.code} #{response.message}")
-
-    response.each_header do |k, v|
-      log.debug("response-header: #{k} => #{v}")
-    end
-  end
-
-  def response_parse response, body, uri
+  def parse uri, response, body
     content_type = nil

     unless response['Content-Type'].nil?
       data, = response['Content-Type'].split ';', 2
       content_type, = data.downcase.split ',', 2 unless data.nil?
     end

     # Find our pluggable parser
-    parser_klass = @pluggable_parser.parser(content_type)
+    parser_klass = @pluggable_parser.parser content_type

-    parser_klass.new(uri, response, body, response.code) { |parser|
+    parser_klass.new uri, response, body, response.code do |parser|
       parser.mech = self if parser.respond_to? :mech=

-      if @watch_for_set and parser.respond_to?(:watch_for_set=)
-        parser.watch_for_set = @watch_for_set
-      end
-    }
-  end
-
-  def response_read response, request
-    body = StringIO.new
-    body.set_encoding Encoding::BINARY if body.respond_to? :set_encoding
-    total = 0
-
-    response.read_body { |part|
-      total += part.length
-      body.write(part)
-      log.debug("Read #{part.length} bytes (#{total} total)") if log
-    }
-
-    body.rewind
-
-    raise Mechanize::ResponseCodeError, response if
-      Net::HTTPUnknownResponse === response
-
-    content_length = response.content_length
-
-    unless Net::HTTP::Head === request or Net::HTTPRedirection === response then
-      raise EOFError, "Content-Length (#{content_length}) does not match " \
-                      "response body length (#{body.length})" if
-        content_length and content_length != body.length
+      parser.watch_for_set = @watch_for_set if
+        @watch_for_set and parser.respond_to?(:watch_for_set=)
     end
-
-    case response['Content-Encoding']
-    when nil, 'none', '7bit' then
-      body.string
-    when 'deflate' then
-      log.debug('deflate body') if log
-
-      if content_length > 0 or body.length > 0 then
-        begin
-          Zlib::Inflate.inflate body.string
-        rescue Zlib::BufError, Zlib::DataError
-          log.error('Unable to inflate page, retrying with raw deflate') if log
-          begin
-            Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body.string)
-          rescue Zlib::BufError, Zlib::DataError
-            log.error("unable to inflate page: #{$!}") if log
-            ''
-          end
-        end
-      end
-    when 'gzip', 'x-gzip' then
-      log.debug('gzip body') if log
-
-      if content_length > 0 or body.length > 0 then
-        begin
-          zio = Zlib::GzipReader.new body
-          zio.read
-        rescue Zlib::BufError, Zlib::GzipFile::Error
-          log.error('Unable to gunzip body, trying raw inflate') if log
-          body.rewind
-          body.read 10
-          Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(body.read)
-        rescue Zlib::DataError
-          log.error("unable to gunzip page: #{$!}") if log
-          ''
-        ensure
-          zio.close if zio and not zio.closed?
-        end
-      end
-    else
-      raise Mechanize::Error,
-            "Unsupported Content-Encoding: #{response['Content-Encoding']}"
-    end
   end

-  def response_redirect response, method, page, redirects
-    case @redirect_ok
-    when true, :all
-      # shortcut
-    when false, nil
-      return page
-    when :permanent
-      return page if response_class != Net::HTTPMovedPermanently
+  # Runs given block, then resets the page history as it was before. self is
+  # given as a parameter to the block. Returns the value of the block.
+  def transact
+    history_backup = @agent.history.dup
+    begin
+      yield self
+    ensure
+      @agent.history = history_backup
    end
-
-    log.info("follow redirect to: #{response['Location']}") if log
-
-    from_uri = page.uri
-
-    raise RedirectLimitReachedError.new(page, redirects) if
-      redirects + 1 > redirection_limit
-
-    redirect_method = method == :head ? :head : :get
-
-    page = fetch_page(response['Location'].to_s, redirect_method, {}, [],
-                      page, redirects + 1)
-
-    @history.push(page, from_uri)
-
-    return page
   end
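transact keeps its public contract but now snapshots and restores @agent.history rather than a local @history. A small sketch of the guarantee (the URLs are illustrative):

    agent = Mechanize.new
    agent.get 'http://example.com/'

    agent.transact do |a|
      a.get 'http://example.com/other'  # history grows inside the block
    end
    # the ensure clause has restored the pre-block history here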

-  def response_authenticate(response, page, uri, request, headers, params,
-                            referer)
-    raise ResponseCodeError, page unless @user || @password
-    raise ResponseCodeError, page if @auth_hash.has_key?(uri.host)
-
-    if response['www-authenticate'] =~ /Digest/i
-      @auth_hash[uri.host] = :digest
-      if response['server'] =~ /Microsoft-IIS/
-        @auth_hash[uri.host] = :iis_digest
-      end
-      @digest = response['www-authenticate']
-    else
-      @auth_hash[uri.host] = :basic
-    end
-
-    fetch_page(uri, request.method.downcase.to_sym, headers, params, referer)
+  def robots
+    @agent.robots
   end

-  private
-
-  def webrobots_http_get(uri)
-    get_file(uri)
-  rescue Mechanize::ResponseCodeError => e
-    return '' if e.response_code == '404'
-    raise e
+  def robots= enabled
+    @agent.robots = enabled
   end

-  def webrobots
-    @webrobots ||= WebRobots.new(@user_agent, :http_get => method(:webrobots_http_get))
-  end
+  alias :page :current_page

-  def set_http proxy = nil
-    @http = Net::HTTP::Persistent.new 'mechanize', proxy
+  private

-    @http.keep_alive = @keep_alive_time
-
-    @http.ca_file = @ca_file
-    @http.verify_callback = @verify_callback
-
-    if @cert and @key then
-      cert = if OpenSSL::X509::Certificate === @cert then
-               @cert
-             else
-               OpenSSL::X509::Certificate.new ::File.read @cert
-             end
-
-      key = if OpenSSL::PKey::PKey === @key then
-              @key
-            else
-              OpenSSL::PKey::RSA.new ::File.read(@key), @pass
-            end
-
-      @http.certificate = cert
-      @http.private_key = key
-    end
-  end
-
   def post_form(uri, form, headers = {})
     cur_page = form.page || current_page ||
       Page.new(nil, {'content-type'=>'text/html'})

     request_data = form.request_data
@@ -985,103 +613,20 @@
       'Content-Type' => form.enctype,
       'Content-Length' => request_data.size.to_s,
     }.merge headers

     # fetch the page
-    page = fetch_page uri, :post, headers, [request_data], cur_page
+    page = @agent.fetch uri, :post, headers, [request_data], cur_page
     add_to_history(page)
     page
   end

-  # uri is an absolute URI
-  def fetch_page uri, method = :get, headers = {}, params = [],
-                 referer = current_page, redirects = 0
-    referer_uri = referer ? referer.uri : nil
-
-    uri = resolve uri, referer
-
-    uri, params = resolve_parameters uri, method, params
-
-    request = http_request uri, method, params
-
-    connection = connection_for uri
-
-    request_auth request, uri
-
-    enable_gzip request
-
-    request_language_charset request
-    request_cookies request, uri
-    request_host request, uri
-    request_referer request, uri, referer_uri
-    request_user_agent request
-    request_add_headers request, headers
-
-    pre_connect request
-
-    # Consult robots.txt
-    if robots && uri.is_a?(URI::HTTP)
-      robots_allowed?(uri) or raise RobotsDisallowedError.new(uri)
-    end
-
-    # Add If-Modified-Since if page is in history
-    if (page = visited_page(uri)) and page.response['Last-Modified']
-      request['If-Modified-Since'] = page.response['Last-Modified']
-    end if(@conditional_requests)
-
-    # Specify timeouts if given
-    connection.open_timeout = @open_timeout if @open_timeout
-    connection.read_timeout = @read_timeout if @read_timeout
-
-    request_log request
-
-    response_body = nil
-
-    # Send the request
-    response = connection.request(uri, request) { |res|
-      response_log res
-
-      response_body = response_read res, request
-
-      res
-    }
-
-    post_connect response
-
-    page = response_parse response, response_body, uri
-
-    response_cookies response, uri, page
-
-    meta = response_follow_meta_refresh response, uri, page, redirects
-    return meta if meta
-
-    case response
-    when Net::HTTPSuccess
-      if robots && page.is_a?(Page)
-        page.parser.noindex? and raise RobotsDisallowedError.new(uri)
-      end
-
-      page
-    when Mechanize::FileResponse
-      page
-    when Net::HTTPNotModified
-      log.debug("Got cached page") if log
-      visited_page(uri) || page
-    when Net::HTTPRedirection
-      response_redirect response, method, page, redirects
-    when Net::HTTPUnauthorized
-      response_authenticate(response, page, uri, request, headers, params,
-                            referer)
-    else
-      raise ResponseCodeError.new(page), "Unhandled response"
-    end
-  end
-
   def add_to_history(page)
-    @history.push(page, resolve(page.uri))
-    history_added.call(page) if history_added
+    @agent.history.push(page, @agent.resolve(page.uri))
+    @history_added.call(page) if @history_added
   end
+
 end

 require 'mechanize/content_type_error'
 require 'mechanize/cookie'
 require 'mechanize/cookie_jar'
@@ -1089,16 +634,19 @@
 require 'mechanize/file_connection'
 require 'mechanize/file_request'
 require 'mechanize/file_response'
 require 'mechanize/form'
 require 'mechanize/history'
+require 'mechanize/http'
+require 'mechanize/http/agent'
 require 'mechanize/page'
 require 'mechanize/inspect'
 require 'mechanize/monkey_patch'
 require 'mechanize/pluggable_parsers'
 require 'mechanize/redirect_limit_reached_error'
 require 'mechanize/redirect_not_get_or_head_error'
 require 'mechanize/response_code_error'
+require 'mechanize/response_read_error'
 require 'mechanize/robots_disallowed_error'
 require 'mechanize/unsupported_scheme_error'
 require 'mechanize/util'
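Nothing removed in the later hunks is gone for good: fetch_page, resolve, the request_*/response_* helpers, set_http and the WebRobots plumbing all moved into Mechanize::HTTP::Agent, which is why mechanize/http and mechanize/http/agent join the require list. Code that reached into the old private helpers must now go through the agent; a hedged sketch using only calls visible in this diff:

    m = Mechanize.new

    # the low-level fetch that get/delete/head/post now delegate to
    page = m.agent.fetch 'http://example.com/', :get, {}, []

    # robots support survives as a reader/writer pair wrapping the agent
    m.robots = true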