# Original Code:
# Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de)
#
# New Code:
# Copyright (c) 2006 by Aaron Patterson (aaronp@rubyforge.org)
#
# Please see the LICENSE file for licensing.
#

# required due to the missing get_fields method in Ruby 1.8.2
unless RUBY_VERSION > "1.8.2"
  $LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides")
end
require 'net/http'
require 'net/https'

# Monkey patch for ruby 1.8.4
unless RUBY_VERSION > "1.8.4"
  module Net # :nodoc:
    class HTTPResponse # :nodoc:
      CODE_TO_OBJ['500'] = HTTPInternalServerError
    end
  end
end

require 'uri'
require 'webrick/httputils'
require 'zlib'
require 'stringio'
require 'digest/md5'

require 'mechanize/monkey_patch'
require 'mechanize/cookie'
require 'mechanize/errors'
require 'mechanize/pluggable_parsers'
require 'mechanize/form'
require 'mechanize/form_elements'
require 'mechanize/history'
require 'mechanize/list'
require 'mechanize/page'
require 'mechanize/page_elements'
require 'mechanize/inspect'

module WWW

# = Synopsis
# The Mechanize library is used for automating interaction with a website.
# It can follow links and submit forms; form fields can be populated before
# submission.  A history of visited URLs is maintained and can be queried.
#
# == Example
#  require 'rubygems'
#  require 'mechanize'
#  require 'logger'
#
#  agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
#  agent.user_agent_alias = 'Mac Safari'
#  page = agent.get("http://www.google.com/")
#  search_form = page.forms.name("f").first
#  search_form.fields.name("q").value = "Hello"
#  search_results = agent.submit(search_form)
#  puts search_results.body
class Mechanize
  ##
  # The version of Mechanize you are using.
  VERSION = '0.6.9'

  ##
  # User Agent aliases
  AGENT_ALIASES = {
    'Windows IE 6'    => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Windows IE 7'    => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
    'Mac Safari'      => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3',
    'Mac FireFox'     => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
    'Mac Mozilla'     => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
    'Linux Mozilla'   => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
    'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
    'Mechanize'       => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
  }

  attr_accessor :cookie_jar
  attr_accessor :log
  attr_accessor :open_timeout, :read_timeout
  attr_accessor :user_agent
  attr_accessor :watch_for_set
  attr_accessor :ca_file
  attr_accessor :key
  attr_accessor :cert
  attr_accessor :pass
  attr_accessor :redirect_ok
  attr_accessor :keep_alive_time
  attr_accessor :keep_alive
  attr_accessor :conditional_requests
  attr_accessor :follow_meta_refresh

  attr_reader :history
  attr_reader :pluggable_parser

  alias :follow_redirect? :redirect_ok

  @@nonce_count = -1
  CNONCE = Digest::MD5.hexdigest("%x" % (Time.now.to_i + rand(65535)))

  def initialize
    # attr_accessors
    @cookie_jar    = CookieJar.new
    @log           = nil
    @open_timeout  = nil
    @read_timeout  = nil
    @user_agent    = AGENT_ALIASES['Mechanize']
    @watch_for_set = nil
    @ca_file       = nil
    @cert          = nil  # OpenSSL Certificate
    @key           = nil  # OpenSSL Private Key
    @pass          = nil  # OpenSSL Password
    @redirect_ok   = true # Should we follow redirects?

    # attr_readers
    @history          = WWW::Mechanize::History.new
    @pluggable_parser = PluggableParser.new

    # Auth variables
    @user      = nil # Auth User
    @password  = nil # Auth Password
    @digest    = nil # DigestAuth Digest
    @auth_hash = {}  # Keep track of urls for sending auth

    # Proxy settings
    @proxy_addr = nil
    @proxy_pass = nil
    @proxy_port = nil
    @proxy_user = nil

    @conditional_requests = true
    @follow_meta_refresh  = false

    # Connection Cache & Keep alive
    @connection_cache = {}
    @keep_alive_time  = 300
    @keep_alive       = true

    yield self if block_given?
  end
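
  # All of the accessors above can also be set in the block given to new.
  # A minimal configuration sketch (hypothetical proxy host, port, and log
  # file name):
  #
  #  agent = WWW::Mechanize.new do |a|
  #    a.log = Logger.new('mech.log')
  #    a.open_timeout = 10   # seconds to wait for the connection to open
  #    a.read_timeout = 30   # seconds to wait for each read
  #    a.set_proxy('proxy.example.com', 8080)
  #    a.user_agent_alias = 'Linux Mozilla'
  #  end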

  def max_history=(length); @history.max_size = length; end
  def max_history; @history.max_size; end

  # Sets the proxy address, port, user, and password
  def set_proxy(addr, port, user = nil, pass = nil)
    @proxy_addr, @proxy_port, @proxy_user, @proxy_pass = addr, port, user, pass
  end

  # Set the user agent for the Mechanize object.
  # See AGENT_ALIASES
  def user_agent_alias=(al)
    self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias")
  end

  # Returns a list of cookies stored in the cookie jar.
  def cookies
    @cookie_jar.to_a
  end

  # Sets the user and password to be used for basic authentication.
  def basic_auth(user, password)
    auth(user, password)
  end

  def auth(user, password)
    @user     = user
    @password = password
  end

  # Fetches the URL passed in and returns a page.
  def get(url, referer = nil, &block)
    cur_page = referer || current_page ||
                    Page.new(nil, {'content-type' => 'text/html'})

    # fetch the page
    abs_uri = to_absolute_uri(url, cur_page)
    request = fetch_request(abs_uri)
    page    = fetch_page(abs_uri, request, cur_page, &block)
    add_to_history(page)
    page
  end

  # Fetch a file and return the contents of the file.
  def get_file(url)
    get(url).body
  end

  # Clicks the WWW::Mechanize::Link object passed in and returns the
  # page fetched.
  def click(link)
    referer = begin
                link.page
              rescue
                nil
              end
    uri = to_absolute_uri(
      link.attributes['href'] || link.attributes['src'] || link.href,
      referer || current_page()
    )
    get(uri, referer)
  end

  # Equivalent to the browser back button.  Returns the most recent page
  # visited.
  def back
    @history.pop
  end

  # Posts to the given URL with the query parameters passed in.  Query
  # parameters can be passed as a hash, or as an array of arrays.
  # Example:
  #  agent.post('http://example.com/', "foo" => "bar")
  # or
  #  agent.post('http://example.com/', [ ["foo", "bar"] ])
  def post(url, query = {})
    node = Hpricot::Elem.new(Hpricot::STag.new('form'))
    node['method']  = 'POST'
    node['enctype'] = 'application/x-www-form-urlencoded'

    form = Form.new(node)
    query.each { |k, v| form.fields << Field.new(k, v) }
    post_form(url, form)
  end

  # Submit a form with an optional button.
  # Without a button:
  #  page = agent.get('http://example.com')
  #  agent.submit(page.forms.first)
  # With a button:
  #  agent.submit(page.forms.first, page.forms.first.buttons.first)
  def submit(form, button = nil)
    form.add_button_to_query(button) if button
    uri = to_absolute_uri(form.action)
    case form.method.upcase
    when 'POST'
      post_form(uri, form)
    when 'GET'
      uri.query = WWW::Mechanize.build_query_string(form.build_query)
      get(uri)
    else
      raise "unsupported method: #{form.method.upcase}"
    end
  end

  # Returns the current page loaded by Mechanize
  def current_page
    @history.last
  end

  # Returns whether or not a url has been visited
  def visited?(url)
    ! visited_page(url).nil?
  end

  # Returns a visited page for the url passed in, otherwise nil
  def visited_page(url)
    if url.respond_to? :href
      url = url.href
    end
    @history.visited_page(to_absolute_uri(url))
  end

  # Runs the given block, then resets the page history to what it was
  # before.  self is given as a parameter to the block.  Returns the value
  # of the block.
  def transact
    history_backup = @history.dup
    begin
      yield self
    ensure
      @history = history_backup
    end
  end

  alias :page :current_page
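
  # A short sketch of transact (hypothetical URLs): pages fetched inside
  # the block are dropped from the history once the block returns.
  #
  #  agent.get('http://example.com/')
  #  agent.transact do |a|
  #    a.get('http://example.com/elsewhere')
  #  end
  #  agent.current_page.uri # => http://example.com/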

  protected

  # Adds the standard request headers (connection, encoding, cookies,
  # referer, user agent, conditional and auth headers) to request, and
  # returns it.
  def set_headers(uri, request, cur_page)
    if @keep_alive
      request.add_field('Connection', 'keep-alive')
      request.add_field('Keep-Alive', keep_alive_time.to_s)
    else
      request.add_field('Connection', 'close')
    end
    request.add_field('Accept-Encoding', 'gzip,identity')
    request.add_field('Accept-Language', 'en-us,en;q=0.5')
    request.add_field('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')

    unless @cookie_jar.empty?(uri)
      cookies = @cookie_jar.cookies(uri)
      cookie  = cookies.length > 0 ? cookies.join("; ") : nil
      if log
        cookies.each do |c|
          log.debug("using cookie: #{c}")
        end
      end
      request.add_field('Cookie', cookie)
    end

    # Add Referer header to request
    unless cur_page.uri.nil?
      request.add_field('Referer', cur_page.uri.to_s)
    end

    # Add User-Agent header to request
    request.add_field('User-Agent', @user_agent) if @user_agent

    # Add If-Modified-Since if page is in history
    if @conditional_requests
      if (page = visited_page(uri)) && page.response['Last-Modified']
        request.add_field('If-Modified-Since', page.response['Last-Modified'])
      end
    end

    if @auth_hash[uri.to_s]
      raise 'Please provide username and password' unless @user || @password
      case @auth_hash[uri.to_s]
      when :basic
        request.basic_auth(@user, @password)
      when :digest
        @digest_response ||= nil # initialize to avoid an ivar warning
        @digest_response = self.gen_auth_header(uri, @digest) if @digest
        request.add_field('Authorization', @digest_response) if @digest_response
      end
    end
    request
  end

  # Generates a Digest Authorization header (RFC 2617) from the server's
  # WWW-Authenticate challenge passed in as auth_header.
  def gen_auth_header(uri, auth_header, is_IIS = false)
    @@nonce_count += 1

    # Parse the challenge parameters (realm, nonce, qop, ...)
    auth_header =~ /^(\w+) (.*)/
    params = {}
    $2.gsub(/(\w+)="(.*?)"/) { params[$1] = $2 }

    a_1 = "#{@user}:#{params['realm']}:#{@password}"
    a_2 = "GET:#{uri.path}"
    request_digest = ''
    request_digest << Digest::MD5.hexdigest(a_1)
    request_digest << ':' << params['nonce']
    request_digest << ':' << ('%08x' % @@nonce_count)
    request_digest << ':' << CNONCE
    request_digest << ':' << params['qop']
    request_digest << ':' << Digest::MD5.hexdigest(a_2)

    header = ''
    header << "Digest username=\"#{@user}\", "
    header << "realm=\"#{params['realm']}\", "

    if is_IIS then
      header << "qop=\"#{params['qop']}\", "
    else
      header << "qop=#{params['qop']}, "
    end

    header << "uri=\"#{uri.path}\", "
    header << "nonce=\"#{params['nonce']}\", "
    header << "nc=#{'%08x' % @@nonce_count}, "
    header << "cnonce=\"#{CNONCE}\", "
    header << "response=\"#{Digest::MD5.hexdigest(request_digest)}\""
    return header
  end

  private

  def to_absolute_uri(url, cur_page = current_page())
    unless url.is_a? URI
      url = url.to_s.strip.gsub(/[^#{0.chr}-#{125.chr}]/) { |match|
        sprintf('%%%X', match.unpack($KCODE == 'UTF8' ? 'U' : 'c')[0])
      }

      url = URI.parse(
              Util.html_unescape(
                url.split(/%[0-9A-Fa-f]{2}/).zip(
                  url.scan(/%[0-9A-Fa-f]{2}/)
                ).map { |x, y|
                  "#{URI.escape(x)}#{y}"
                }.join('').gsub(/%23/, '#')
              )
            )
    end

    # construct an absolute uri
    if url.relative?
      raise 'no history. please specify an absolute URL' unless cur_page.uri
      url = cur_page.uri + url
      # Strip initial "/.." bits from the path
      url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
    end

    return url
  end
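
  # A worked sketch of the resolution rules above, assuming the current
  # page is http://example.com/a/b.html (hypothetical URLs):
  #
  #  to_absolute_uri('c.html')       # => http://example.com/a/c.html
  #  to_absolute_uri('/../top.html') # => http://example.com/top.html ("/.." stripped)
  #  to_absolute_uri(URI.parse('http://other.example.com/')) # returned unchanged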

  def post_form(url, form)
    cur_page = form.page || current_page ||
                    Page.new(nil, {'content-type' => 'text/html'})

    request_data = form.request_data

    abs_url = to_absolute_uri(url, cur_page)
    request = fetch_request(abs_url, :post)
    request.add_field('Content-Type', form.enctype)
    request.add_field('Content-Length', request_data.size.to_s)

    log.debug("query: #{ request_data.inspect }") if log

    # fetch the page
    page = fetch_page(abs_url, request, cur_page, [request_data])
    add_to_history(page)
    page
  end

  # Creates a new request object based on the scheme and type
  def fetch_request(uri, type = :get)
    raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)
    if type == :get
      Net::HTTP::Get.new(uri.request_uri)
    else
      Net::HTTP::Post.new(uri.request_uri)
    end
  end

  # uri is an absolute URI
  def fetch_page(uri, request, cur_page = current_page(), request_data = [])
    raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)

    log.info("#{ request.class }: #{ request.path }") if log

    page = nil

    cache_obj = (@connection_cache["#{uri.host}:#{uri.port}"] ||= {
      :connection         => nil,
      :keep_alive_options => {},
    })
    http_obj = cache_obj[:connection]
    if http_obj.nil? || ! http_obj.started?
      http_obj = cache_obj[:connection] =
        Net::HTTP.new(uri.host,
                      uri.port,
                      @proxy_addr,
                      @proxy_port,
                      @proxy_user,
                      @proxy_pass
                     )
      cache_obj[:keep_alive_options] = {}

      # Specify timeouts if given
      http_obj.open_timeout = @open_timeout if @open_timeout
      http_obj.read_timeout = @read_timeout if @read_timeout
    end

    if uri.scheme == 'https' && ! http_obj.started?
      http_obj.use_ssl     = true
      http_obj.verify_mode = OpenSSL::SSL::VERIFY_NONE
      if @ca_file
        http_obj.ca_file     = @ca_file
        http_obj.verify_mode = OpenSSL::SSL::VERIFY_PEER
      end
      if @cert && @key
        http_obj.cert = OpenSSL::X509::Certificate.new(::File.read(@cert))
        http_obj.key  = OpenSSL::PKey::RSA.new(::File.read(@key), @pass)
      end
    end

    # If we're keeping connections alive and the last request time is too
    # long ago, stop the connection.  Or, if the max requests left is 1,
    # reset the connection.
    if @keep_alive && http_obj.started?
      opts = cache_obj[:keep_alive_options]
      if( (opts[:timeout] &&
           Time.now.to_i - cache_obj[:last_request_time] > opts[:timeout].to_i) ||
          opts[:max] && opts[:max].to_i == 1 )

        log.debug('Finishing stale connection') if log
        http_obj.finish

      end
    end

    http_obj.start unless http_obj.started?

    request = set_headers(uri, request, cur_page)

    # Log specified headers for the request
    if log
      request.each_header do |k, v|
        log.debug("request-header: #{ k } => #{ v }")
      end
    end

    cache_obj[:last_request_time] = Time.now.to_i

    # Send the request
    response = http_obj.request(request, *request_data) { |response|
      body = StringIO.new
      total = 0
      response.read_body { |part|
        total += part.length
        body.write(part)
        log.debug("Read #{total} bytes") if log
      }
      body.rewind

      response.each_header { |k, v|
        log.debug("response-header: #{ k } => #{ v }")
      } if log

      content_type = nil
      unless response['Content-Type'].nil?
        data = response['Content-Type'].match(/^([^;]*)/)
        content_type = data[1].downcase unless data.nil?
      end

      response_body =
        if encoding = response['Content-Encoding']
          case encoding.downcase
          when 'gzip'
            log.debug('gunzip body') if log
            Zlib::GzipReader.new(body).read
          when 'x-gzip'
            body.read
          else
            raise 'Unsupported content encoding'
          end
        else
          body.read
        end

      # Find our pluggable parser
      page = @pluggable_parser.parser(content_type).new(
        uri,
        response,
        response_body,
        response.code
      ) { |parser|
        parser.mech = self if parser.respond_to? :mech=
        if parser.respond_to?(:watch_for_set=) && @watch_for_set
          parser.watch_for_set = @watch_for_set
        end
      }
    }

    # If the server sends back keep alive options, save them
    if keep_alive_info = response['keep-alive']
      keep_alive_info.split(/,\s*/).each do |option|
        k, v = option.split(/=/)
        cache_obj[:keep_alive_options] ||= {}
        cache_obj[:keep_alive_options][k.intern] = v
      end
    end

    (response.get_fields('Set-Cookie') || []).each do |cookie|
      Cookie::parse(uri, cookie, log) { |c|
        log.debug("saved cookie: #{c}") if log
        @cookie_jar.add(uri, c)
      }
    end

    log.info("status: #{ page.code }") if log

    res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]

    if follow_meta_refresh && (redirect = page.meta.first)
      return redirect.click
    end

    return page if res_klass <= Net::HTTPSuccess

    if res_klass == Net::HTTPNotModified
      log.debug("Got cached page") if log
      return visited_page(uri)
    elsif res_klass <= Net::HTTPRedirection
      return page unless follow_redirect?
      log.info("follow redirect to: #{ response['Location'] }") if log
      from_uri = page.uri
      abs_uri  = to_absolute_uri(response['Location'].to_s, page)
      page = fetch_page(abs_uri, fetch_request(abs_uri), page)
      @history.push(page, from_uri)
      return page
    elsif res_klass <= Net::HTTPUnauthorized
      if response['www-authenticate'] =~ /Digest/i
        @auth_hash[uri.to_s] = :digest
        @digest = response['www-authenticate']
        return fetch_page(uri, fetch_request(uri), cur_page, request_data)
      else
        @auth_hash[uri.to_s] = :basic
        return fetch_page(uri, fetch_request(uri), cur_page, request_data)
      end
    end

    raise ResponseCodeError.new(page), "Unhandled response", caller
  end
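
  # Builds an application/x-www-form-urlencoded query string from a list
  # of key/value pairs.  For illustration (hypothetical values; spaces
  # become '+' via WEBrick::HTTPUtils.escape_form):
  #
  #  WWW::Mechanize.build_query_string([['q', 'hello world'], ['lang', 'en']])
  #  # => "q=hello+world&lang=en"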
  def self.build_query_string(parameters)
    vals = []
    parameters.each { |k, v|
      next if k.nil?
      vals << [WEBrick::HTTPUtils.escape_form(k),
               WEBrick::HTTPUtils.escape_form(v.to_s)].join("=")
    }
    vals.join("&")
  end

  def add_to_history(page)
    @history.push(page)
  end

  # :stopdoc:
  class Util
    def self.html_unescape(s)
      return s unless s
      s.gsub(/&(\w+|#[0-9]+);/) { |match|
        number = case match
                 when /&(\w+);/
                   Hpricot::NamedCharacters[$1]
                 when /&#([0-9]+);/
                   $1.to_i
                 end

        number ? ([number].pack('U') rescue match) : match
      }
    end
  end
  # :startdoc:
end
end # module WWW