lib/mechanize.rb in mechanize-0.6.11 vs lib/mechanize.rb in mechanize-0.7.0
- old
+ new
@@ -1,657 +1,7 @@
-# Original Code:
# Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de)
+# Copyright (c) 2007 by Aaron Patterson (aaronp@rubyforge.org)
#
-# New Code:
-# Copyright (c) 2006 by Aaron Patterson (aaronp@rubyforge.org)
-#
# Please see the LICENSE file for licensing.
-#
-# required due to the missing get_fields method in Ruby 1.8.2
-unless RUBY_VERSION > "1.8.2"
- $LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides")
-end
-require 'net/http'
-require 'net/https'
-
-# Monkey patch for ruby 1.8.4
-unless RUBY_VERSION > "1.8.4"
-module Net # :nodoc:
- class HTTPResponse # :nodoc:
- CODE_TO_OBJ['500'] = HTTPInternalServerError
- end
-end
-end
-
-require 'uri'
-require 'webrick/httputils'
-require 'zlib'
-require 'stringio'
-require 'digest/md5'
-require 'mechanize/monkey_patch'
-require 'mechanize/cookie'
-require 'mechanize/errors'
-require 'mechanize/pluggable_parsers'
-require 'mechanize/form'
-require 'mechanize/form_elements'
-require 'mechanize/history'
-require 'mechanize/list'
-require 'mechanize/page'
-require 'mechanize/page_elements'
-require 'mechanize/inspect'
-
-module WWW
-
-# = Synopsis
-# The Mechanize library is used for automating interaction with a website. It
-# can follow links, and submit forms. Form fields can be populated and
-# submitted. A history of URL's is maintained and can be queried.
-#
-# == Example
-# require 'rubygems'
-# require 'mechanize'
-# require 'logger'
-#
-# agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
-# agent.user_agent_alias = 'Mac Safari'
-# page = agent.get("http://www.google.com/")
-# search_form = page.forms.name("f").first
-# search_form.fields.name("q").value = "Hello"
-# search_results = agent.submit(search_form)
-# puts search_results.body
-class Mechanize
- ##
- # The version of Mechanize you are using.
-
- VERSION = '0.6.11'
-
- ##
- # User Agent aliases
- AGENT_ALIASES = {
- 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
- 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
- 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
- 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3',
- 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
- 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
- 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
- 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
- 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
- }
-
- attr_accessor :cookie_jar
- attr_accessor :log
- attr_accessor :open_timeout, :read_timeout
- attr_accessor :user_agent
- attr_accessor :watch_for_set
- attr_accessor :ca_file
- attr_accessor :key
- attr_accessor :cert
- attr_accessor :pass
- attr_accessor :redirect_ok
- attr_accessor :keep_alive_time
- attr_accessor :keep_alive
- attr_accessor :conditional_requests
- attr_accessor :follow_meta_refresh
-
- attr_reader :history
- attr_reader :pluggable_parser
-
- alias :follow_redirect? :redirect_ok
-
- @@nonce_count = -1
- CNONCE = Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535)))
-
- def initialize
- # attr_accessors
- @cookie_jar = CookieJar.new
- @log = nil
- @open_timeout = nil
- @read_timeout = nil
- @user_agent = AGENT_ALIASES['Mechanize']
- @watch_for_set = nil
- @ca_file = nil
- @cert = nil # OpenSSL Certificate
- @key = nil # OpenSSL Private Key
- @pass = nil # OpenSSL Password
- @redirect_ok = true # Should we follow redirects?
-
- # attr_readers
- @history = WWW::Mechanize::History.new
- @pluggable_parser = PluggableParser.new
-
- # Auth variables
- @user = nil # Auth User
- @password = nil # Auth Password
- @digest = nil # DigestAuth Digest
- @auth_hash = {} # Keep track of urls for sending auth
-
- # Proxy settings
- @proxy_addr = nil
- @proxy_pass = nil
- @proxy_port = nil
- @proxy_user = nil
-
- @conditional_requests = true
-
- @follow_meta_refresh = false
-
- # Connection Cache & Keep alive
- @connection_cache = {}
- @keep_alive_time = 300
- @keep_alive = true
-
- yield self if block_given?
- end
-
- def max_history=(length); @history.max_size = length; end
- def max_history; @history.max_size; end
-
- # Sets the proxy address, port, user, and password
- def set_proxy(addr, port, user = nil, pass = nil)
- @proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass
- end
-
- # Set the user agent for the Mechanize object.
- # See AGENT_ALIASES
- def user_agent_alias=(al)
- self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias")
- end
-
- # Returns a list of cookies stored in the cookie jar.
- def cookies
- @cookie_jar.to_a
- end
-
- # Sets the user and password to be used for basic authentication.
- def basic_auth(user, password)
- auth(user, password)
- end
-
- def auth(user, password)
- @user = user
- @password = password
- end
-
- # Fetches the URL passed in and returns a page.
- def get(url, referer=nil, &block)
- cur_page = referer || current_page ||
- Page.new( nil, {'content-type'=>'text/html'})
-
- # fetch the page
- abs_uri = to_absolute_uri(url, cur_page)
- request = fetch_request(abs_uri)
- page = fetch_page(abs_uri, request, cur_page, &block)
- add_to_history(page)
- page
- end
-
- # Fetch a file and return the contents of the file.
- def get_file(url)
- get(url).body
- end
-
-
- # Clicks the WWW::Mechanize::Link object passed in and returns the
- # page fetched.
- def click(link)
- referer =
- begin
- link.page
- rescue
- nil
- end
- uri = to_absolute_uri(
- link.attributes['href'] || link.attributes['src'] || link.href,
- referer || current_page()
- )
- get(uri, referer)
- end
-
- # Equivalent to the browser back button. Returns the most recent page
- # visited.
- def back
- @history.pop
- end
-
- # Posts to the given URL wht the query parameters passed in. Query
- # parameters can be passed as a hash, or as an array of arrays.
- # Example:
- # agent.post('http://example.com/', "foo" => "bar")
- # or
- # agent.post('http://example.com/', [ ["foo", "bar"] ])
- def post(url, query={})
- node = Hpricot::Elem.new(Hpricot::STag.new('form'))
- node['method'] = 'POST'
- node['enctype'] = 'application/x-www-form-urlencoded'
-
- form = Form.new(node)
- query.each { |k,v|
- form.fields << Field.new(k,v)
- }
- post_form(url, form)
- end
-
- # Submit a form with an optional button.
- # Without a button:
- # page = agent.get('http://example.com')
- # agent.submit(page.forms.first)
- # With a button
- # agent.submit(page.forms.first, page.forms.first.buttons.first)
- def submit(form, button=nil)
- form.add_button_to_query(button) if button
- uri = to_absolute_uri(form.action, form.page)
- case form.method.upcase
- when 'POST'
- post_form(uri, form)
- when 'GET'
- uri.query = WWW::Mechanize.build_query_string(form.build_query)
- get(uri)
- else
- raise "unsupported method: #{form.method.upcase}"
- end
- end
-
- # Returns the current page loaded by Mechanize
- def current_page
- @history.last
- end
-
- # Returns whether or not a url has been visited
- def visited?(url)
- ! visited_page(url).nil?
- end
-
- # Returns a visited page for the url passed in, otherwise nil
- def visited_page(url)
- if url.respond_to? :href
- url = url.href
- end
- @history.visited_page(to_absolute_uri(url))
- end
-
- # Runs given block, then resets the page history as it was before. self is
- # given as a parameter to the block. Returns the value of the block.
- def transact
- history_backup = @history.dup
- begin
- yield self
- ensure
- @history = history_backup
- end
- end
-
- alias :page :current_page
-
- protected
- def set_headers(uri, request, cur_page)
- if @keep_alive
- request.add_field('Connection', 'keep-alive')
- request.add_field('Keep-Alive', keep_alive_time.to_s)
- else
- request.add_field('Connection', 'close')
- end
- request.add_field('Accept-Encoding', 'gzip,identity')
- request.add_field('Accept-Language', 'en-us,en;q0.5')
- request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
-
- unless @cookie_jar.empty?(uri)
- cookies = @cookie_jar.cookies(uri)
- cookie = cookies.length > 0 ? cookies.join("; ") : nil
- if log
- cookies.each do |c|
- log.debug("using cookie: #{c}")
- end
- end
- request.add_field('Cookie', cookie)
- end
-
- # Add Referer header to request
- unless cur_page.uri.nil?
- request.add_field('Referer', cur_page.uri.to_s)
- end
-
- # Add User-Agent header to request
- request.add_field('User-Agent', @user_agent) if @user_agent
-
- # Add If-Modified-Since if page is in history
- if @conditional_requests
- if( (page = visited_page(uri)) && page.response['Last-Modified'] )
- request.add_field('If-Modified-Since', page.response['Last-Modified'])
- end
- end
-
- if( @auth_hash[uri.host] )
- case @auth_hash[uri.host]
- when :basic
- request.basic_auth(@user, @password)
- when :digest
- @digest_response ||= nil
- @digest_response = self.gen_auth_header(uri,request,@digest) if @digest
- request.add_field('Authorization', @digest_response) if @digest_response
- end
- end
-
- request
- end
-
- def gen_auth_header(uri, request, auth_header, is_IIS = false)
- @@nonce_count += 1
-
- user = @digest_user
- password = @digest_password
-
- auth_header =~ /^(\w+) (.*)/
-
- params = {}
- $2.gsub(/(\w+)="(.*?)"/) { params[$1] = $2 }
-
- a_1 = "#{@user}:#{params['realm']}:#{@password}"
- a_2 = "#{request.method}:#{uri.path}"
- request_digest = ''
- request_digest << Digest::MD5.hexdigest(a_1)
- request_digest << ':' << params['nonce']
- request_digest << ':' << ('%08x' % @@nonce_count)
- request_digest << ':' << CNONCE
- request_digest << ':' << params['qop']
- request_digest << ':' << Digest::MD5.hexdigest(a_2)
-
- header = ''
- header << "Digest username=\"#{@user}\", "
- header << "realm=\"#{params['realm']}\", "
- if is_IIS then
- header << "qop=\"#{params['qop']}\", "
- else
- header << "qop=#{params['qop']}, "
- end
- header << "uri=\"#{uri.path}\", "
- header << "algorithm=MD5, "
- header << "nonce=\"#{params['nonce']}\", "
- header << "nc=#{'%08x' % @@nonce_count}, "
- header << "cnonce=\"#{CNONCE}\", "
- header << "response=\"#{Digest::MD5.hexdigest(request_digest)}\""
-
- return header
- end
-
- private
-
- def to_absolute_uri(url, cur_page=current_page())
- unless url.is_a? URI
- url = url.to_s.strip.gsub(/[^#{0.chr}-#{125.chr}]/) { |match|
- sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0])
- }
-
- url = URI.parse(
- Util.html_unescape(
- url.split(/%[0-9A-Fa-f]{2}|#/).zip(
- url.scan(/%[0-9A-Fa-f]{2}|#/)
- ).map { |x,y|
- "#{URI.escape(x)}#{y}"
- }.join('')
- )
- )
- end
-
- url.path = '/' if url.path.length == 0
-
- # construct an absolute uri
- if url.relative?
- raise 'no history. please specify an absolute URL' unless cur_page.uri
- base = cur_page.respond_to?(:bases) ? cur_page.bases.last : nil
- url = ((base && base.uri && base.uri.absolute?) ?
- base.uri :
- cur_page.uri) + url
- url = cur_page.uri + url
- # Strip initial "/.." bits from the path
- url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
- end
-
- return url
- end
-
- def post_form(url, form)
- cur_page = form.page || current_page ||
- Page.new( nil, {'content-type'=>'text/html'})
-
- request_data = form.request_data
-
- abs_url = to_absolute_uri(url, cur_page)
- request = fetch_request(abs_url, :post)
- request.add_field('Content-Type', form.enctype)
- request.add_field('Content-Length', request_data.size.to_s)
-
- log.debug("query: #{ request_data.inspect }") if log
-
- # fetch the page
- page = fetch_page(abs_url, request, cur_page, [request_data])
- add_to_history(page)
- page
- end
-
- # Creates a new request object based on the scheme and type
- def fetch_request(uri, type = :get)
- raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)
- if type == :get
- Net::HTTP::Get.new(uri.request_uri)
- else
- Net::HTTP::Post.new(uri.request_uri)
- end
- end
-
- # uri is an absolute URI
- def fetch_page(uri, request, cur_page=current_page(), request_data=[])
- raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)
-
- log.info("#{ request.class }: #{ request.path }") if log
-
- page = nil
-
- cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= {
- :connection => nil,
- :keep_alive_options => {},
- })
- http_obj = cache_obj[:connection]
- if http_obj.nil? || ! http_obj.started?
- http_obj = cache_obj[:connection] =
- Net::HTTP.new( uri.host,
- uri.port,
- @proxy_addr,
- @proxy_port,
- @proxy_user,
- @proxy_pass
- )
- cache_obj[:keep_alive_options] = {}
-
- # Specify timeouts if given
- http_obj.open_timeout = @open_timeout if @open_timeout
- http_obj.read_timeout = @read_timeout if @read_timeout
- end
-
- if uri.scheme == 'https' && ! http_obj.started?
- http_obj.use_ssl = true
- http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE
- if @ca_file
- http_obj.ca_file = @ca_file
- http_obj.verify_mode = OpenSSL::SSL::VERIFY_PEER
- end
- if @cert && @key
- http_obj.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
- http_obj.key = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
- end
- end
-
- # If we're keeping connections alive and the last request time is too
- # long ago, stop the connection. Or, if the max requests left is 1,
- # reset the connection.
- if @keep_alive && http_obj.started?
- opts = cache_obj[:keep_alive_options]
- if((opts[:timeout] &&
- Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) ||
- opts[:max] && opts[:max].to_i == 1)
-
- log.debug('Finishing stale connection') if log
- http_obj.finish
-
- end
- end
-
- http_obj.start unless http_obj.started?
-
- request = set_headers(uri, request, cur_page)
-
- # Log specified headers for the request
- if log
- request.each_header do |k, v|
- log.debug("request-header: #{ k } => #{ v }")
- end
- end
-
- cache_obj[:last_request_time] = Time.now.to_i
-
- # Send the request
- response = http_obj.request(request, *request_data) {|response|
-
- body = StringIO.new
- total = 0
- response.read_body { |part|
- total += part.length
- body.write(part)
- log.debug("Read #{total} bytes") if log
- }
- body.rewind
-
- response.each_header { |k,v|
- log.debug("response-header: #{ k } => #{ v }")
- } if log
-
- content_type = nil
- unless response['Content-Type'].nil?
- data = response['Content-Type'].match(/^([^;]*)/)
- content_type = data[1].downcase unless data.nil?
- end
-
- response_body =
- if encoding = response['Content-Encoding']
- case encoding.downcase
- when 'gzip'
- log.debug('gunzip body') if log
- Zlib::GzipReader.new(body).read
- when 'x-gzip'
- body.read
- else
- raise 'Unsupported content encoding'
- end
- else
- body.read
- end
-
- # Find our pluggable parser
- page = @pluggable_parser.parser(content_type).new(
- uri,
- response,
- response_body,
- response.code
- ) { |parser|
- parser.mech = self if parser.respond_to? :mech=
- if parser.respond_to?(:watch_for_set=) && @watch_for_set
- parser.watch_for_set = @watch_for_set
- end
- }
-
- }
-
- # If the server sends back keep alive options, save them
- if keep_alive_info = response['keep-alive']
- keep_alive_info.split(/,\s*/).each do |option|
- k, v = option.split(/=/)
- cache_obj[:keep_alive_options] ||= {}
- cache_obj[:keep_alive_options][k.intern] = v
- end
- end
-
- (response.get_fields('Set-Cookie')||[]).each do |cookie|
- Cookie::parse(uri, cookie, log) { |c|
- log.debug("saved cookie: #{c}") if log
- @cookie_jar.add(uri, c)
- }
- end
-
- log.info("status: #{ page.code }") if log
-
- res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]
-
- if follow_meta_refresh && page.respond_to?(:meta) &&
- (redirect = page.meta.first)
- return redirect.click
- end
-
- return page if res_klass <= Net::HTTPSuccess
-
- if res_klass == Net::HTTPNotModified
- log.debug("Got cached page") if log
- return visited_page(uri)
- elsif res_klass <= Net::HTTPRedirection
- return page unless follow_redirect?
- log.info("follow redirect to: #{ response['Location'] }") if log
- from_uri = page.uri
- abs_uri = to_absolute_uri(response['Location'].to_s, page)
- page = fetch_page(abs_uri, fetch_request(abs_uri), page)
- @history.push(page, from_uri)
- return page
- elsif res_klass <= Net::HTTPUnauthorized
- raise ResponseCodeError.new(page) unless @user || @password
- raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
- if response['www-authenticate'] =~ /Digest/i
- @auth_hash[uri.host] = :digest
- @digest = response['www-authenticate']
- else
- @auth_hash[uri.host] = :basic
- end
- return fetch_page( uri,
- fetch_request(uri, request.method.downcase.to_sym),
- cur_page,
- request_data
- )
- end
-
- raise ResponseCodeError.new(page), "Unhandled response", caller
- end
-
- def self.build_query_string(parameters)
- vals = []
- parameters.each { |k,v|
- next if k.nil?
- vals <<
- [WEBrick::HTTPUtils.escape_form(k),
- WEBrick::HTTPUtils.escape_form(v.to_s)].join("=")
- }
-
- vals.join("&")
- end
-
- def add_to_history(page)
- @history.push(page, to_absolute_uri(page.uri))
- end
-
- # :stopdoc:
- class Util
- def self.html_unescape(s)
- return s unless s
- s.gsub(/&(\w+|#[0-9]+);/) { |match|
- number = case match
- when /&(\w+);/
- Hpricot::NamedCharacters[$1]
- when /&#([0-9]+);/
- $1.to_i
- end
-
- number ? ([number].pack('U') rescue match) : match
- }
- end
- end
- # :startdoc:
-end
-
-end # module WWW
+require 'www/mechanize'