lib/mechanize.rb in tenderlove-mechanize-0.9.3.20090623142847 vs lib/mechanize.rb in tenderlove-mechanize-0.9.3.20090911221705
- old
+ new
@@ -1,7 +1,621 @@
-# Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de)
-# Copyright (c) 2007 by Aaron Patterson (aaronp@rubyforge.org)
+require 'net/http'
+require 'net/https'
+require 'uri'
+require 'webrick/httputils'
+require 'zlib'
+require 'stringio'
+require 'digest/md5'
+require 'fileutils'
+require 'nokogiri'
+require 'forwardable'
+require 'iconv'
+require 'nkf'
+
+require 'mechanize/util'
+require 'mechanize/content_type_error'
+require 'mechanize/response_code_error'
+require 'mechanize/unsupported_scheme_error'
+require 'mechanize/redirect_limit_reached_error'
+require 'mechanize/redirect_not_get_or_head_error'
+require 'mechanize/cookie'
+require 'mechanize/cookie_jar'
+require 'mechanize/history'
+require 'mechanize/form'
+require 'mechanize/pluggable_parsers'
+require 'mechanize/file_response'
+require 'mechanize/inspect'
+require 'mechanize/chain'
+require 'mechanize/monkey_patch'
+
+# = Synopsis
+# The Mechanize library is used for automating interaction with websites. It
+# can follow links and submit forms. Form fields can be populated and
+# submitted. A history of visited URLs is maintained and can be queried.
#
-# Please see the LICENSE file for licensing.
+# == Example
+# require 'rubygems'
+# require 'mechanize'
+# require 'logger'
+#
+# agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
+# agent.user_agent_alias = 'Mac Safari'
+# page = agent.get("http://www.google.com/")
+# search_form = page.form_with(:name => "f")
+# search_form.field_with(:name => "q").value = "Hello"
+# search_results = agent.submit(search_form)
+# puts search_results.body
+class Mechanize
+ ##
+ # The version of Mechanize you are using. It is also interpolated into the
+ # 'Mechanize' entry of AGENT_ALIASES below.
+ VERSION = '0.9.3'
+ ##
+ # User Agent aliases: maps a short alias name to the full User-Agent
+ # header string it stands for. Looked up by #user_agent_alias=; the
+ # 'Mechanize' entry is the default user agent assigned in #initialize.
+ AGENT_ALIASES = {
+ 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
+ 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
+ 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
+ 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3',
+ 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
+ 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
+ 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
+ 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
+ 'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3',
+ 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
+ }
-require 'www/mechanize'
+ # Request behaviour: cookie storage, User-Agent string, connection
+ # timeouts, keep-alive, conditional (If-Modified-Since) requests and
+ # redirect / meta-refresh handling.
+ attr_accessor :cookie_jar, :user_agent,
+               :open_timeout, :read_timeout,
+               :keep_alive, :keep_alive_time,
+               :conditional_requests,
+               :redirect_ok, :redirection_limit, :follow_meta_refresh
+
+ # OpenSSL settings: server certificate file, client certificate, private
+ # key, key password and the certificate-verification callback.
+ attr_accessor :ca_file, :cert, :key, :pass, :verify_callback
+
+ # +watch_for_set+ is handed to the response body parser, +history_added+
+ # is called with each page added to the history, and +scheme_handlers+
+ # maps a URI scheme to the lambda that handles links of that scheme.
+ attr_accessor :watch_for_set, :history_added, :scheme_handlers
+
+ # A hash of custom request headers
+ attr_accessor :request_headers
+
+ # The HTML parser to be used when parsing documents
+ attr_accessor :html_parser
+
+ attr_reader :history, :pluggable_parser
+
+ alias_method :follow_redirect?, :redirect_ok
+
+ # Class-level default parser; each new agent copies it in #initialize.
+ @html_parser = Nokogiri::HTML
+ class << self
+   attr_accessor :html_parser, :log
+ end
+
+ # Builds an agent with default settings, then yields it to the optional
+ # block so the caller can adjust those settings before first use.
+ def initialize
+   # Values exposed through attr_accessor
+   @cookie_jar      = CookieJar.new
+   @log             = nil
+   @open_timeout    = nil
+   @read_timeout    = nil
+   @user_agent      = AGENT_ALIASES['Mechanize']
+   @watch_for_set   = nil
+   @history_added   = nil
+
+   # OpenSSL configuration. +verify_callback+ is invoked for OpenSSL errors
+   # raised while verifying the server certificate chain; it can be used
+   # for debugging, or to ignore errors by always returning _true_.
+   @ca_file         = nil  # server certificate file
+   @verify_callback = nil
+   @cert            = nil  # client certificate
+   @key             = nil  # private key
+   @pass            = nil  # password
+
+   @redirect_ok     = true # follow redirects?
+
+   # Values exposed through attr_reader
+   @history          = Mechanize::History.new
+   @pluggable_parser = PluggableParser.new
+
+   # Authentication state
+   @user            = nil  # auth user
+   @password        = nil  # auth password
+   @digest          = nil  # DigestAuth digest
+   @auth_hash       = {}   # tracks urls for sending auth
+   @request_headers = {}   # request headers to be used
+
+   # Proxy configuration (see #set_proxy)
+   @proxy_addr = nil
+   @proxy_pass = nil
+   @proxy_port = nil
+   @proxy_user = nil
+
+   @conditional_requests = true
+
+   @follow_meta_refresh = false
+   @redirection_limit   = 20
+
+   # Connection cache and keep-alive
+   @connection_cache = {}
+   @keep_alive_time  = 300
+   @keep_alive       = true
+
+   # Schemes without an explicit handler get one that raises.
+   @scheme_handlers = Hash.new do |handlers, scheme|
+     handlers[scheme] = lambda do |link, page|
+       raise UnsupportedSchemeError.new(scheme)
+     end
+   end
+   # Supported schemes share a single pass-through handler.
+   @scheme_handlers['http'] = lambda { |link, page| link }
+   %w[https relative file].each do |scheme|
+     @scheme_handlers[scheme] = @scheme_handlers['http']
+   end
+
+   @pre_connect_hook  = Chain::PreConnectHook.new
+   @post_connect_hook = Chain::PostConnectHook.new
+
+   @html_parser = self.class.html_parser
+
+   yield self if block_given?
+ end
+
+ # Sets the maximum number of pages kept in the history.
+ def max_history=(length)
+   @history.max_size = length
+ end
+
+ # Returns the maximum number of pages kept in the history.
+ def max_history
+   @history.max_size
+ end
+
+ # Sets the logger. It is stored on the class, so it is shared by all agents.
+ def log=(l)
+   self.class.log = l
+ end
+
+ # Returns the class-wide logger, or nil when none is set.
+ def log
+   self.class.log
+ end
+
+ # Returns the hooks held by the pre-connect chain link, which is the last
+ # link of the chain run before a request is sent.
+ def pre_connect_hooks; @pre_connect_hook.hooks end
+
+ # Returns the hooks held by the post-connect chain link, which is the
+ # first link of the chain run after a response is read.
+ def post_connect_hooks; @post_connect_hook.hooks end
+
+ # Stores the proxy address, port, user and password used for new
+ # connections. +addr+ should be a bare host name, with no "http://".
+ # Returns the four values as an array.
+ def set_proxy(addr, port, user = nil, pass = nil)
+   @proxy_addr = addr
+   @proxy_port = port
+   @proxy_user = user
+   @proxy_pass = pass
+   [addr, port, user, pass]
+ end
+
+ # Sets the user agent from one of the names in AGENT_ALIASES.
+ # Raises a RuntimeError when +al+ is not a known alias.
+ def user_agent_alias=(al)
+   agent_string = AGENT_ALIASES[al]
+   raise("unknown agent alias") unless agent_string
+   self.user_agent = agent_string
+ end
+
+ # Returns the contents of the cookie jar as an array.
+ def cookies; @cookie_jar.to_a end
+
+ # Remembers the credentials to use when a server answers with an
+ # authentication challenge. Also available as #basic_auth.
+ def auth(user, password)
+   @user     = user
+   @password = password
+ end
+ alias_method :basic_auth, :auth
+
+ # Fetches a URL with GET, adds the page to the history and returns it.
+ # The page is also yielded when a block is given.
+ #
+ # Two call styles are accepted:
+ #
+ #   get(url, parameters = [], referer = nil)
+ #   get(:url => url, :params => [...], :referer => ..., :headers => {...})
+ #
+ # +referer+ may be a page, a URI or a String. When it is omitted, an
+ # absolute http(s) URL is fetched with a blank referer page and anything
+ # else is fetched relative to the current page.
+ #
+ # Raises ArgumentError when the hash form has no :url.
+ def get(options, parameters = [], referer = nil)
+   if options.is_a? Hash
+     raise ArgumentError.new("url must be specified") unless url = options[:url]
+     parameters = options[:params] || []
+     referer = options[:referer]
+     headers = options[:headers]
+   else
+     url = options
+     # Legacy call style get(url, referer): a second argument that is not
+     # enumerable is taken to be the referer rather than the parameters.
+     unless parameters.respond_to?(:each)
+       referer = parameters
+       parameters = []
+     end
+   end
+
+   unless referer
+     # Only a genuinely absolute http(s) URL gets a blank referer. The
+     # pattern is anchored on the scheme separator so that a relative path
+     # which merely starts with "http" (e.g. "httpd.html") is still
+     # resolved against the current page.
+     if url.to_s =~ %r{\Ahttps?://}i
+       referer = Page.new(nil, {'content-type'=>'text/html'})
+     else
+       referer = current_page || Page.new(nil, {'content-type'=>'text/html'})
+     end
+   end
+
+   # FIXME: Huge hack so that using a URI as a referer works. I need to
+   # refactor everything to pass around URIs but still support
+   # Mechanize::Page#base
+   unless referer.is_a?(Mechanize::File)
+     referer = referer.is_a?(String) ?
+       Page.new(URI.parse(referer), {'content-type' => 'text/html'}) :
+       Page.new(referer, {'content-type' => 'text/html'})
+   end
+
+   # fetch the page
+   page = fetch_page( :uri => url,
+                      :referer => referer,
+                      :headers => headers || {},
+                      :params => parameters
+                    )
+   add_to_history(page)
+   yield page if block_given?
+   page
+ end
+
+ ####
+ # Sends a PUT request to +url+ with +entity+ as the request body.
+ # +options+ is passed through to #request_with_entity:
+ #
+ # put('http://tenderlovemaking.com/', 'new content', :headers => {'Content-Type' => 'text/plain'})
+ #
+ def put(url, entity, options = {}); request_with_entity(:put, url, entity, options) end
+
+ ####
+ # Sends a DELETE request to +url+ with +query_params+, adds the resulting
+ # page to the history and returns it. The request is built by #head with
+ # the verb overridden, so +options+ accepts the same keys:
+ #
+ # delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
+ #
+ def delete(url, query_params = {}, options = {})
+   delete_options = options.merge(:verb => :delete)
+   deleted_page = head(url, query_params, delete_options)
+   add_to_history(deleted_page)
+   deleted_page
+ end
+
+ ####
+ # Sends a HEAD request to +url+ with +query_params+ and returns the page,
+ # yielding it first when a block is given. The page is not added to the
+ # history. Keys in +options+ override the defaults built here:
+ #
+ # head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
+ #
+ def head(url, query_params = {}, options = {})
+   defaults = { :uri => url, :headers => {}, :params => query_params, :verb => :head }
+   options = defaults.merge(options)
+   page = fetch_page(options)
+   yield page if block_given?
+   page
+ end
+
+ # Fetches +url+ with #get and returns the body of the response.
+ def get_file(url)
+   fetched = get(url)
+   fetched.body
+ end
+
+ # Follows +link+ and returns the page fetched. +link+ may be a
+ # Mechanize::Link or any object that answers ['href'] / ['src'].
+ # The link's own page is used as the referer when available, otherwise
+ # the current page is used.
+ def click(link)
+   begin
+     referer = link.page
+   rescue
+     # Objects without a usable #page fall back to the current page.
+     referer = nil
+   end
+
+   if link.respond_to?(:href)
+     href = link.href
+   else
+     href = (link['href'] || link['src'])
+   end
+
+   get(:url => href, :referer => (referer || current_page()))
+ end
+
+ # Equivalent to the browser back button: pops the most recent entry off
+ # the history and returns it.
+ def back; @history.pop end
+
+ # Posts to +url+ with the given request entity. The entity is either a
+ # string, which is sent as-is through #request_with_entity, or a list of
+ # key-value pairs given as a hash or an array of arrays, which is sent as
+ # a form. A value that is an IO turns the form into a multipart file
+ # upload.
+ #
+ # Examples:
+ # agent.post('http://example.com/', "foo" => "bar")
+ #
+ # agent.post('http://example.com/', [ ["foo", "bar"] ])
+ #
+ # agent.post('http://example.com/', "<message>hello</message>", 'Content-Type' => 'application/xml')
+ def post(url, query={}, headers={})
+   return request_with_entity(:post, url, query, :headers => headers) if query.is_a?(String)
+
+   # Stand-in for a form node: a hash of attributes that also answers
+   # #search with an empty result.
+   fake_node = {
+     'method' => 'POST',
+     'enctype' => 'application/x-www-form-urlencoded'
+   }
+   def fake_node.search(*args); []; end
+
+   form = Form.new(fake_node)
+   query.each do |key, value|
+     if value.is_a?(IO)
+       form.enctype = 'multipart/form-data'
+       upload = Form::FileUpload.new(key.to_s,::File.basename(value.path))
+       upload.file_data = value.read
+       form.file_uploads << upload
+     else
+       form.fields << Form::Field.new(key.to_s,value)
+     end
+   end
+   post_form(url, form, headers)
+ end
+
+ # Submits +form+, optionally as if +button+ had been pressed, and returns
+ # the resulting page. Raises a RuntimeError for form methods other than
+ # POST and GET.
+ # Without a button:
+ # page = agent.get('http://example.com')
+ # agent.submit(page.forms.first)
+ # With a button
+ # agent.submit(page.forms.first, page.forms.first.buttons.first)
+ def submit(form, button=nil, headers={})
+   form.add_button_to_query(button) if button
+
+   http_method = form.method.upcase
+   if 'POST' == http_method
+     post_form(form.action, form, headers)
+   elsif 'GET' == http_method
+     # Strip any query string from the action; the form supplies the query.
+     action_without_query = form.action.gsub(/\?[^\?]*$/, '')
+     get( :url => action_without_query,
+          :params => form.build_query,
+          :headers => headers,
+          :referer => form.page
+        )
+   else
+     raise "unsupported method: #{form.method.upcase}"
+   end
+ end
+
+ # Sends +entity+ as the body of a +verb+ request to +url+, adds the
+ # resulting page to the history and returns it.
+ #
+ # +options+ may carry :headers (merged over the default Content-Type and
+ # Content-Length) and any other key understood by #fetch_page, such as
+ # :referer. The referer defaults to the current page.
+ def request_with_entity(verb, url, entity, options={})
+   cur_page = current_page || Page.new( nil, {'content-type'=>'text/html'})
+
+   options = {
+     :uri => url,
+     :referer => cur_page,
+     :headers => {},
+   }.update(options)
+
+   # Content-Length is a byte count. String#size counts characters on
+   # Ruby 1.9+, so prefer #bytesize where it exists and fall back to #size
+   # on older Rubies (where the two are the same for strings).
+   entity_length = entity.respond_to?(:bytesize) ? entity.bytesize : entity.size
+
+   # An explicit :headers => nil is treated the same as no extra headers.
+   headers = {
+     'Content-Type' => 'application/octet-stream',
+     'Content-Length' => entity_length.to_s,
+   }.update(options[:headers] || {})
+
+   options.update({
+     :verb => verb,
+     :params => [entity],
+     :headers => headers,
+   })
+
+   page = fetch_page(options)
+   add_to_history(page)
+   page
+ end
+
+ # Returns the last page in the history, i.e. the page currently loaded.
+ def current_page; @history.last end
+
+ # Returns true when #visited_page finds a page for +url+, false otherwise.
+ def visited?(url)
+   seen_page = visited_page(url)
+   !seen_page.nil?
+ end
+
+ # Returns the page recorded in the history for +url+, or nil when there
+ # is none. +url+ may also be a link-like object responding to #href.
+ def visited_page(url)
+   url = url.href if url.respond_to? :href
+   @history.visited_page(resolve(url))
+ end
+
+ # Yields self to the block and returns the block's value. The history is
+ # copied beforehand and put back afterwards, even when the block raises,
+ # so pages visited inside the block leave no trace in the history.
+ def transact
+   saved_history = @history.dup
+   begin
+     yield self
+   ensure
+     @history = saved_history
+   end
+ end
+
+ alias_method :page, :current_page
+
+ private
+
+ # Runs +url+ through the URI resolver chain, relative to +referer+, and
+ # returns the resolved URI as a string.
+ def resolve(url, referer = current_page())
+   request = { :uri => url, :referer => referer }
+   resolver = Chain::URIResolver.new(@scheme_handlers)
+   # The chain rewrites request[:uri] in place.
+   Chain.new([resolver]).handle(request)
+   request[:uri].to_s
+ end
+
+ # POSTs the data of +form+ to +url+, adds the resulting page to the
+ # history and returns it. The referer is the form's page, falling back to
+ # the current page. +headers+ are merged over the Content-Type and
+ # Content-Length derived from the form.
+ def post_form(url, form, headers = {})
+   cur_page = form.page || current_page ||
+     Page.new( nil, {'content-type'=>'text/html'})
+
+   request_data = form.request_data
+
+   log.debug("query: #{ request_data.inspect }") if log
+
+   # Content-Length is a byte count. String#size counts characters on
+   # Ruby 1.9+, so prefer #bytesize where it exists and fall back to #size
+   # on older Rubies (where the two are the same for strings).
+   content_length = request_data.respond_to?(:bytesize) ?
+     request_data.bytesize : request_data.size
+
+   # fetch the page
+   page = fetch_page( :uri => url,
+                      :referer => cur_page,
+                      :verb => :post,
+                      :params => [request_data],
+                      :headers => {
+                        'Content-Type' => form.enctype,
+                        'Content-Length' => content_length.to_s,
+                      }.merge(headers))
+   add_to_history(page)
+   page
+ end
+
+ # Performs one HTTP request and returns the resulting page, following
+ # meta refreshes, redirects and authentication challenges as configured.
+ #
+ # +params+ is merged over the defaults below. The keys callers in this
+ # file supply are :uri, :referer, :verb, :params, :headers and
+ # :redirects (the number of redirects already followed).
+ #
+ # The work is done in three stages, each a Chain operating on the shared
+ # +options+ hash: one that prepares the request and connection, one that
+ # reads and decodes the response, and one that parses the body and
+ # handles response headers.
+ #
+ # Raises RedirectLimitReachedError when following a redirect or refresh
+ # would exceed +redirection_limit+, ResponseCodeError for failed
+ # authentication or any unhandled response class, and StandardError for
+ # an unparsable Refresh header. EOFError, Errno::ECONNRESET and
+ # Errno::EPIPE are retried up to two times before being re-raised.
+ def fetch_page(params)
+   options = {
+     :request => nil,
+     :response => nil,
+     :connection => nil,
+     :referer => current_page(),
+     :uri => nil,
+     :verb => :get,
+     :agent => self,
+     :redirects => 0,
+     :params => [],
+     :headers => {},
+   }.merge(params)
+
+   before_connect = Chain.new([
+     Chain::URIResolver.new(@scheme_handlers),
+     Chain::ParameterResolver.new,
+     Chain::RequestResolver.new,
+     Chain::ConnectionResolver.new(
+       @connection_cache,
+       @keep_alive,
+       @proxy_addr,
+       @proxy_port,
+       @proxy_user,
+       @proxy_pass
+     ),
+     Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
+     Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
+     Chain::HeaderResolver.new(
+       @keep_alive,
+       @keep_alive_time,
+       @cookie_jar,
+       @user_agent,
+       @request_headers
+     ),
+     Chain::CustomHeaders.new,
+     @pre_connect_hook,
+   ])
+   before_connect.handle(options)
+
+   # Values filled in by the chain above
+   uri = options[:uri]
+   request = options[:request]
+   cur_page = options[:referer]
+   request_data = options[:params]
+   redirects = options[:redirects]
+   http_obj = options[:connection]
+
+   # Add If-Modified-Since if page is in history
+   if( (page = visited_page(uri)) && page.response['Last-Modified'] )
+     request['If-Modified-Since'] = page.response['Last-Modified']
+   end if(@conditional_requests)
+
+   # Specify timeouts if given
+   http_obj.open_timeout = @open_timeout if @open_timeout
+   http_obj.read_timeout = @read_timeout if @read_timeout
+   http_obj.start unless http_obj.started?
+
+   # Log specified headers for the request
+   log.info("#{ request.class }: #{ request.path }") if log
+   request.each_header do |k, v|
+     log.debug("request-header: #{ k } => #{ v }")
+   end if log
+
+   # Send the request
+   attempts = 0
+   begin
+     response = http_obj.request(request, *request_data) { |r|
+       connection_chain = Chain.new([
+         Chain::ResponseReader.new(r),
+         Chain::BodyDecodingHandler.new,
+       ])
+       connection_chain.handle(options)
+     }
+   rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
+     # Restart the connection and retry; give up on the third failure.
+     log.error("Rescuing EOF error") if log
+     http_obj.finish
+     raise x if attempts >= 2
+     request.body = nil
+     http_obj.start
+     attempts += 1
+     retry
+   end
+
+   after_connect = Chain.new([
+     @post_connect_hook,
+     Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
+     Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
+   ])
+   after_connect.handle(options)
+
+   res_klass = options[:res_klass]
+   response_body = options[:response_body]
+   page = options[:page]
+
+   log.info("status: #{ page.code }") if log
+
+   if follow_meta_refresh
+     redirect_uri = nil
+     referer = page
+     if (page.respond_to?(:meta) && (redirect = page.meta.first))
+       redirect_uri = redirect.uri.to_s
+       # Apply the redirect limit to HTML meta refreshes as well, so a page
+       # that keeps refreshing cannot make this method recurse without
+       # bound.
+       if redirects + 1 > redirection_limit
+         raise RedirectLimitReachedError.new(page, redirects)
+       end
+       sleep redirect.node['delay'].to_f
+       referer = Page.new(nil, {'content-type'=>'text/html'})
+     elsif refresh = response['refresh']
+       delay, redirect_uri = Page::Meta.parse(refresh, uri)
+       raise StandardError, "Invalid refresh http header" unless delay
+       if redirects + 1 > redirection_limit
+         raise RedirectLimitReachedError.new(page, redirects)
+       end
+       sleep delay.to_f
+     end
+     if redirect_uri
+       @history.push(page, page.uri)
+       return fetch_page(
+         :uri => redirect_uri,
+         :referer => referer,
+         :params => [],
+         :verb => :get,
+         :redirects => redirects + 1
+       )
+     end
+   end
+
+   return page if res_klass <= Net::HTTPSuccess
+
+   if res_klass == Net::HTTPNotModified
+     log.debug("Got cached page") if log
+     return visited_page(uri) || page
+   elsif res_klass <= Net::HTTPRedirection
+     return page unless follow_redirect?
+     log.info("follow redirect to: #{ response['Location'] }") if log
+     from_uri = page.uri
+     raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
+     # A redirected HEAD stays a HEAD; every other verb is followed with GET.
+     redirect_verb = options[:verb] == :head ? :head : :get
+     page = fetch_page( :uri => response['Location'].to_s,
+                        :referer => page,
+                        :params => [],
+                        :verb => redirect_verb,
+                        :redirects => redirects + 1
+                      )
+     @history.push(page, from_uri)
+     return page
+   elsif res_klass <= Net::HTTPUnauthorized
+     raise ResponseCodeError.new(page) unless @user || @password
+     # An entry for this host means auth was already attempted and failed.
+     raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
+     if response['www-authenticate'] =~ /Digest/i
+       @auth_hash[uri.host] = :digest
+       if response['server'] =~ /Microsoft-IIS/
+         @auth_hash[uri.host] = :iis_digest
+       end
+       @digest = response['www-authenticate']
+     else
+       @auth_hash[uri.host] = :basic
+     end
+     # Repeat the same request, now that the auth scheme is recorded.
+     return fetch_page( :uri => uri,
+                        :referer => cur_page,
+                        :verb => request.method.downcase.to_sym,
+                        :params => request_data,
+                        :headers => options[:headers]
+                      )
+   end
+
+   raise ResponseCodeError.new(page), "Unhandled response", caller
+ end
+
+ # Records +page+ in the history under its resolved URI, then calls the
+ # +history_added+ callback with the page when one is set.
+ def add_to_history(page)
+   resolved_uri = resolve(page.uri)
+   @history.push(page, resolved_uri)
+   callback = history_added
+   callback.call(page) if callback
+ end
+end
+
+# Keeps the WWW::Mechanize name working as an alias for ::Mechanize.
+module WWW
+end
+WWW::Mechanize = ::Mechanize
+