lib/mechanize.rb in tenderlove-mechanize-0.9.3.20090623142847 vs lib/mechanize.rb in tenderlove-mechanize-0.9.3.20090911221705

- old
+ new

require 'net/http'
require 'net/https'
require 'uri'
require 'webrick/httputils'
require 'zlib'
require 'stringio'
require 'digest/md5'
require 'fileutils'
require 'nokogiri'
require 'forwardable'
require 'iconv'
require 'nkf'

require 'mechanize/util'
require 'mechanize/content_type_error'
require 'mechanize/response_code_error'
require 'mechanize/unsupported_scheme_error'
require 'mechanize/redirect_limit_reached_error'
require 'mechanize/redirect_not_get_or_head_error'
require 'mechanize/cookie'
require 'mechanize/cookie_jar'
require 'mechanize/history'
require 'mechanize/form'
require 'mechanize/pluggable_parsers'
require 'mechanize/file_response'
require 'mechanize/inspect'
require 'mechanize/chain'
require 'mechanize/monkey_patch'

# = Synopsis
# The Mechanize library is used for automating interaction with a website.  It
# can follow links and submit forms.  Form fields can be populated and
# submitted.  A history of URLs is maintained and can be queried.
#
# == Example
#  require 'rubygems'
#  require 'mechanize'
#  require 'logger'
#
#  agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
#  agent.user_agent_alias = 'Mac Safari'
#  page = agent.get("http://www.google.com/")
#  search_form = page.form_with(:name => "f")
#  search_form.field_with(:name => "q").value = "Hello"
#  search_results = agent.submit(search_form)
#  puts search_results.body
class Mechanize
  ##
  # The version of Mechanize you are using.
  VERSION = '0.9.3'

  ##
  # User Agent aliases.  Keys are the names accepted by #user_agent_alias=,
  # values are the User-Agent strings they stand for.
  AGENT_ALIASES = {
    'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
    'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3',
    'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
    'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
    'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
    'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
    'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3',
    'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
  }

  attr_accessor :cookie_jar
  attr_accessor :open_timeout, :read_timeout
  attr_accessor :user_agent
  attr_accessor :watch_for_set
  attr_accessor :ca_file
  attr_accessor :key
  attr_accessor :cert
  attr_accessor :pass
  attr_accessor :redirect_ok
  attr_accessor :keep_alive_time
  attr_accessor :keep_alive
  attr_accessor :conditional_requests
  attr_accessor :follow_meta_refresh
  attr_accessor :verify_callback
  attr_accessor :history_added
  attr_accessor :scheme_handlers
  attr_accessor :redirection_limit

  # A hash of custom request headers
  attr_accessor :request_headers

  # The HTML parser to be used when parsing documents
  attr_accessor :html_parser

  attr_reader :history
  attr_reader :pluggable_parser

  alias :follow_redirect? :redirect_ok

  # Class-level default HTML parser; each new agent copies it in #initialize.
  @html_parser = Nokogiri::HTML
  class << self; attr_accessor :html_parser, :log end

  # Builds an agent with default settings and yields it to the optional
  # block so the caller can adjust those settings.
  def initialize
    # attr_accessors
    @cookie_jar       = CookieJar.new
    @log              = nil
    @open_timeout     = nil
    @read_timeout     = nil
    @user_agent       = AGENT_ALIASES['Mechanize']
    @watch_for_set    = nil
    @history_added    = nil
    @ca_file          = nil # OpenSSL server certificate file

    # callback for OpenSSL errors while verifying the server certificate
    # chain, can be used for debugging or to ignore errors by always
    # returning _true_
    @verify_callback  = nil
    @cert             = nil # OpenSSL Certificate
    @key              = nil # OpenSSL Private Key
    @pass             = nil # OpenSSL Password
    @redirect_ok      = true # Should we follow redirects?

    # attr_readers
    @history          = Mechanize::History.new
    @pluggable_parser = PluggableParser.new

    # Auth variables
    @user             = nil # Auth User
    @password         = nil # Auth Password
    @digest           = nil # DigestAuth Digest
    @auth_hash        = {}  # Keep track of urls for sending auth
    @request_headers  = {}  # A hash of request headers to be used

    # Proxy settings
    @proxy_addr       = nil
    @proxy_pass       = nil
    @proxy_port       = nil
    @proxy_user       = nil

    @conditional_requests = true

    @follow_meta_refresh  = false
    @redirection_limit    = 20

    # Connection Cache & Keep alive
    @connection_cache = {}
    @keep_alive_time  = 300
    @keep_alive       = true

    # Any scheme without an explicit handler gets one that raises
    # UnsupportedSchemeError when it is invoked.
    @scheme_handlers = Hash.new { |h,k|
      h[k] = lambda { |link, page|
        raise UnsupportedSchemeError.new(k)
      }
    }
    # The supported schemes all share one handler that returns the link
    # unchanged.
    @scheme_handlers['http']     = lambda { |link, page| link }
    @scheme_handlers['https']    = @scheme_handlers['http']
    @scheme_handlers['relative'] = @scheme_handlers['http']
    @scheme_handlers['file']     = @scheme_handlers['http']

    @pre_connect_hook  = Chain::PreConnectHook.new
    @post_connect_hook = Chain::PostConnectHook.new

    @html_parser = self.class.html_parser

    yield self if block_given?
  end

  # Sets the maximum size of the history.
  def max_history=(length); @history.max_size = length end

  # Returns the maximum size of the history.
  def max_history; @history.max_size end

  # Sets the logger.  The logger is stored on the class, so it is shared by
  # every agent.
  def log=(l); self.class.log = l end

  # Returns the class-wide logger, or nil when none is set.
  def log; self.class.log end

  # Returns the hooks held by the pre-connect hook chain link, which runs as
  # the last step of the before-connect chain in #fetch_page.
  def pre_connect_hooks
    @pre_connect_hook.hooks
  end

  # Returns the hooks held by the post-connect hook chain link, which runs as
  # the first step of the after-connect chain in #fetch_page.
  def post_connect_hooks
    @post_connect_hook.hooks
  end

  # Sets the proxy address, port, user, and password
  # +addr+ should be a host, with no "http://"
  def set_proxy(addr, port, user = nil, pass = nil)
    @proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass
  end

  # Set the user agent for the Mechanize object.
  # See AGENT_ALIASES.  Raises a RuntimeError when +al+ is not a known alias.
  def user_agent_alias=(al)
    self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias")
  end

  # Returns a list of cookies stored in the cookie jar.
  def cookies
    @cookie_jar.to_a
  end

  # Sets the user and password to be used for authentication.
  def auth(user, password)
    @user     = user
    @password = password
  end
  alias :basic_auth :auth

  # Fetches the URL passed in and returns a page.
  #
  # +options+ is either the URL itself, or a Hash with the keys <tt>:url</tt>
  # (required), <tt>:params</tt>, <tt>:referer</tt> and <tt>:headers</tt>.
  # With the non-Hash form, +parameters+ and +referer+ are taken from the
  # positional arguments.
  #
  # The fetched page is added to the history, yielded to the optional block
  # and returned.  Raises ArgumentError when the Hash form has no
  # <tt>:url</tt>.
  def get(options, parameters = [], referer = nil)
    if options.is_a?(Hash)
      raise ArgumentError.new("url must be specified") unless url = options[:url]
      parameters = options[:params] || []
      referer    = options[:referer]
      headers    = options[:headers]
    else
      url = options
      # A second argument that is not enumerable is treated as the referer.
      unless parameters.respond_to?(:each) # FIXME: Remove this in 0.8.0
        referer = parameters
        parameters = []
      end
    end

    unless referer
      if url.to_s =~ /^http/
        # URLs starting with "http" get a blank referer page.
        referer = Page.new(nil, {'content-type'=>'text/html'})
      else
        referer = current_page || Page.new(nil, {'content-type'=>'text/html'})
      end
    end

    # FIXME: Huge hack so that using a URI as a referer works.  I need to
    # refactor everything to pass around URIs but still support
    # Mechanize::Page#base
    unless referer.is_a?(Mechanize::File)
      referer = referer.is_a?(String) ?
        Page.new(URI.parse(referer), {'content-type' => 'text/html'}) :
        Page.new(referer, {'content-type' => 'text/html'})
    end

    # fetch the page
    page = fetch_page(  :uri      => url,
                        :referer  => referer,
                        :headers  => headers || {},
                        :params   => parameters
                     )
    add_to_history(page)
    yield page if block_given?
    page
  end

  ####
  # PUT to +url+ with +entity+, and setting +options+:
  #
  #   put('http://tenderlovemaking.com/', 'new content', :headers => {'Content-Type' => 'text/plain'})
  #
  def put(url, entity, options = {})
    request_with_entity(:put, url, entity, options)
  end

  ####
  # DELETE to +url+ with +query_params+, and setting +options+:
  #
  #   delete('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
  #
  # Implemented through #head with the verb overridden; the resulting page
  # is added to the history.
  def delete(url, query_params = {}, options = {})
    page = head(url, query_params, options.merge({:verb => :delete}))
    add_to_history(page)
    page
  end

  ####
  # HEAD to +url+ with +query_params+, and setting +options+:
  #
  #   head('http://tenderlovemaking.com/', {'q' => 'foo'}, :headers => {})
  #
  # The page is yielded to the optional block and returned.  This method
  # does not add the page to the history.
  def head(url, query_params = {}, options = {})
    options = {
      :uri      => url,
      :headers  => {},
      :params   => query_params,
      :verb     => :head
    }.merge(options)
    # fetch the page
    page = fetch_page(options)
    yield page if block_given?
    page
  end

  # Fetch a file and return the contents of the file.
  def get_file(url)
    get(url).body
  end

  # Clicks the Mechanize::Link object passed in and returns the
  # page fetched.
  #
  # +link+ may also be any object indexable by 'href' or 'src'.  The referer
  # is the link's page when available, otherwise the current page.
  def click(link)
    # Objects that cannot supply a page fall back to the current page below.
    referer = begin
      link.page
    rescue StandardError
      nil
    end
    href = link.respond_to?(:href) ? link.href :
      (link['href'] || link['src'])
    get(:url => href, :referer => (referer || current_page()))
  end

  # Equivalent to the browser back button.  Returns the most recent page
  # visited.
  def back
    @history.pop
  end

  # Posts to the given URL with the request entity.  The request
  # entity is specified by either a string, or a list of key-value
  # pairs represented by a hash or an array of arrays.
  #
  # Values that are IO objects are sent as file uploads, which switches the
  # form encoding to multipart/form-data.
  #
  # Examples:
  #  agent.post('http://example.com/', "foo" => "bar")
  #
  #  agent.post('http://example.com/', [ ["foo", "bar"] ])
  #
  #  agent.post('http://example.com/', "<message>hello</message>", 'Content-Type' => 'application/xml')
  def post(url, query={}, headers={})
    if query.is_a?(String)
      return request_with_entity(:post, url, query, :headers => headers)
    end
    node = {}
    # Create a fake form
    class << node
      def search(*args); []; end
    end
    node['method'] = 'POST'
    node['enctype'] = 'application/x-www-form-urlencoded'

    form = Form.new(node)
    query.each { |k,v|
      if v.is_a?(IO)
        form.enctype = 'multipart/form-data'
        ul = Form::FileUpload.new(k.to_s,::File.basename(v.path))
        ul.file_data = v.read
        form.file_uploads << ul
      else
        form.fields << Form::Field.new(k.to_s,v)
      end
    }
    post_form(url, form, headers)
  end

  # Submit a form with an optional button.
  # Without a button:
  #  page = agent.get('http://example.com')
  #  agent.submit(page.forms.first)
  # With a button
  #  agent.submit(page.forms.first, page.forms.first.buttons.first)
  #
  # Raises a RuntimeError when the form method is neither POST nor GET.
  def submit(form, button=nil, headers={})
    form.add_button_to_query(button) if button
    case form.method.upcase
    when 'POST'
      post_form(form.action, form, headers)
    when 'GET'
      # Strip any query string from the action; the form's own query is sent
      # as the request parameters instead.
      get(  :url      => form.action.gsub(/\?[^\?]*$/, ''),
            :params   => form.build_query,
            :headers  => headers,
            :referer  => form.page
         )
    else
      raise "unsupported method: #{form.method.upcase}"
    end
  end

  # Sends +entity+ as the request body to +url+ using the HTTP +verb+.
  #
  # +options+ may override <tt>:uri</tt>, <tt>:referer</tt> and
  # <tt>:headers</tt>.  Content-Type defaults to application/octet-stream
  # and Content-Length to +entity.size+; caller-supplied headers take
  # precedence over both.  The resulting page is added to the history and
  # returned.
  def request_with_entity(verb, url, entity, options={})
    cur_page = current_page || Page.new( nil, {'content-type'=>'text/html'})

    options = {
      :uri      => url,
      :referer  => cur_page,
      :headers  => {},
    }.update(options)

    # NOTE(review): String#size counts characters rather than bytes on Ruby
    # 1.9 and later; confirm entities are byte strings before relying on this.
    headers = {
      'Content-Type' => 'application/octet-stream',
      'Content-Length' => entity.size.to_s,
    }.update(options[:headers])

    options.update({
      :verb => verb,
      :params => [entity],
      :headers => headers,
    })

    page = fetch_page(options)
    add_to_history(page)
    page
  end

  # Returns the current page loaded by Mechanize
  def current_page
    @history.last
  end

  # Returns whether or not a url has been visited
  def visited?(url)
    ! visited_page(url).nil?
  end

  # Returns a visited page for the url passed in, otherwise nil
  #
  # +url+ may be an object responding to +href+ (such as a link), in which
  # case its href is looked up.
  def visited_page(url)
    if url.respond_to? :href
      url = url.href
    end
    @history.visited_page(resolve(url))
  end

  # Runs given block, then resets the page history as it was before. self is
  # given as a parameter to the block. Returns the value of the block.
  def transact
    history_backup = @history.dup
    begin
      yield self
    ensure
      @history = history_backup
    end
  end

  alias :page :current_page

  private

  # Resolves +url+ against +referer+ by running it through the URI resolver
  # chain link, and returns the result as a String.
  def resolve(url, referer = current_page())
    hash = { :uri => url, :referer => referer }
    # The handler writes the resolved URI back into +hash+.
    Chain.new([
      Chain::URIResolver.new(@scheme_handlers)
    ]).handle(hash)
    hash[:uri].to_s
  end

  # POSTs +form+ to +url+.  Content-Type and Content-Length are derived from
  # the form but can be overridden through +headers+.  The referer is the
  # form's page, falling back to the current page and then to a blank page.
  # The resulting page is added to the history and returned.
  def post_form(url, form, headers = {})
    cur_page = form.page || current_page ||
                    Page.new( nil, {'content-type'=>'text/html'})

    request_data = form.request_data

    log.debug("query: #{ request_data.inspect }") if log

    # fetch the page
    page = fetch_page(  :uri      => url,
                        :referer  => cur_page,
                        :verb     => :post,
                        :params   => [request_data],
                        :headers  => {
                          'Content-Type'    => form.enctype,
                          'Content-Length'  => request_data.size.to_s,
                        }.merge(headers))
    add_to_history(page)
    page
  end

  # Performs a single request described by +params+ and returns the page.
  #
  # Recognised keys, with their defaults, are listed in the +options+ hash
  # below.  The request is prepared by the before-connect chain, sent
  # (retried at most twice on a dropped connection) and the response is
  # processed by the after-connect chain.  Depending on the response and the
  # agent settings the method then follows refreshes and redirects or
  # retries with authentication, calling itself recursively.
  #
  # Raises RedirectLimitReachedError when following another refresh or
  # redirect would exceed +redirection_limit+, and ResponseCodeError for
  # authentication failures and unhandled response classes.
  #
  # uri is an absolute URI
  def fetch_page(params)
    options = {
      :request    => nil,
      :response   => nil,
      :connection => nil,
      :referer    => current_page(),
      :uri        => nil,
      :verb       => :get,
      :agent      => self,
      :redirects  => 0,
      :params     => [],
      :headers    => {},
    }.merge(params)

    before_connect = Chain.new([
      Chain::URIResolver.new(@scheme_handlers),
      Chain::ParameterResolver.new,
      Chain::RequestResolver.new,
      Chain::ConnectionResolver.new(
        @connection_cache,
        @keep_alive,
        @proxy_addr,
        @proxy_port,
        @proxy_user,
        @proxy_pass
      ),
      Chain::SSLResolver.new(@ca_file, @verify_callback, @cert, @key, @pass),
      Chain::AuthHeaders.new(@auth_hash, @user, @password, @digest),
      Chain::HeaderResolver.new(
        @keep_alive,
        @keep_alive_time,
        @cookie_jar,
        @user_agent,
        @request_headers
      ),
      Chain::CustomHeaders.new,
      @pre_connect_hook,
    ])
    before_connect.handle(options)

    # The chain fills in the resolved request state on +options+.
    uri           = options[:uri]
    request       = options[:request]
    cur_page      = options[:referer]
    request_data  = options[:params]
    redirects     = options[:redirects]
    http_obj      = options[:connection]

    # Add If-Modified-Since if page is in history
    if @conditional_requests
      cached        = visited_page(uri)
      last_modified = cached && cached.response['Last-Modified']
      request['If-Modified-Since'] = last_modified if last_modified
    end

    # Specify timeouts if given
    http_obj.open_timeout = @open_timeout if @open_timeout
    http_obj.read_timeout = @read_timeout if @read_timeout
    http_obj.start unless http_obj.started?

    # Log specified headers for the request
    log.info("#{ request.class }: #{ request.path }") if log
    request.each_header do |k, v|
      log.debug("request-header: #{ k } => #{ v }")
    end if log

    # Send the request
    attempts = 0
    begin
      response = http_obj.request(request, *request_data) { |r|
        connection_chain = Chain.new([
          Chain::ResponseReader.new(r),
          Chain::BodyDecodingHandler.new,
        ])
        connection_chain.handle(options)
      }
    rescue EOFError, Errno::ECONNRESET, Errno::EPIPE => x
      # The connection was dropped: restart it and resend, giving up after
      # the second retry.
      log.error("Rescuing EOF error") if log
      http_obj.finish
      raise x if attempts >= 2
      request.body = nil
      http_obj.start
      attempts += 1
      retry
    end

    after_connect = Chain.new([
      @post_connect_hook,
      Chain::ResponseBodyParser.new(@pluggable_parser, @watch_for_set),
      Chain::ResponseHeaderHandler.new(@cookie_jar, @connection_cache),
    ])
    after_connect.handle(options)

    res_klass     = options[:res_klass]
    response_body = options[:response_body]
    page          = options[:page]

    log.info("status: #{ page.code }") if log

    if follow_meta_refresh
      redirect_uri  = nil
      referer       = page
      if (page.respond_to?(:meta) && (redirect = page.meta.first))
        # Meta-tag refreshes count against the redirection limit too, so a
        # page that refreshes to itself cannot recurse without bound.
        if redirects + 1 > redirection_limit
          raise RedirectLimitReachedError.new(page, redirects)
        end
        redirect_uri = redirect.uri.to_s
        sleep redirect.node['delay'].to_f
        referer = Page.new(nil, {'content-type'=>'text/html'})
      elsif refresh = response['refresh']
        delay, redirect_uri = Page::Meta.parse(refresh, uri)
        raise StandardError, "Invalid refresh http header" unless delay
        if redirects + 1 > redirection_limit
          raise RedirectLimitReachedError.new(page, redirects)
        end
        sleep delay.to_f
      end
      if redirect_uri
        @history.push(page, page.uri)
        return fetch_page(
          :uri        => redirect_uri,
          :referer    => referer,
          :params     => [],
          :verb       => :get,
          :redirects  => redirects + 1
        )
      end
    end

    return page if res_klass <= Net::HTTPSuccess

    if res_klass == Net::HTTPNotModified
      log.debug("Got cached page") if log
      return visited_page(uri) || page
    elsif res_klass <= Net::HTTPRedirection
      return page unless follow_redirect?
      log.info("follow redirect to: #{ response['Location'] }") if log
      from_uri  = page.uri
      raise RedirectLimitReachedError.new(page, redirects) if redirects + 1 > redirection_limit
      # A redirected HEAD stays a HEAD; every other verb becomes a GET.
      redirect_verb = options[:verb] == :head ? :head : :get
      page = fetch_page(  :uri => response['Location'].to_s,
                          :referer => page,
                          :params  => [],
                          :verb => redirect_verb,
                          :redirects => redirects + 1
                       )
      @history.push(page, from_uri)
      return page
    elsif res_klass <= Net::HTTPUnauthorized
      raise ResponseCodeError.new(page) unless @user || @password
      # A host already recorded in @auth_hash has had its retry; fail.
      raise ResponseCodeError.new(page) if @auth_hash.has_key?(uri.host)
      if response['www-authenticate'] =~ /Digest/i
        @auth_hash[uri.host] = :digest
        if response['server'] =~ /Microsoft-IIS/
          @auth_hash[uri.host] = :iis_digest
        end
        @digest = response['www-authenticate']
      else
        @auth_hash[uri.host] = :basic
      end
      # Repeat the same request now that an auth scheme is recorded.
      return fetch_page(  :uri      => uri,
                          :referer  => cur_page,
                          :verb     => request.method.downcase.to_sym,
                          :params   => request_data,
                          :headers  => options[:headers]
                       )
    end

    raise ResponseCodeError.new(page), "Unhandled response", caller
  end

  # Records +page+ in the history under its resolved URI, then calls the
  # +history_added+ callback with the page when one is set.
  def add_to_history(page)
    @history.push(page, resolve(page.uri))
    history_added.call(page) if history_added
  end
end

# Backwards-compatible name: WWW::Mechanize refers to the same class.
module WWW; end
WWW::Mechanize = ::Mechanize