lib/mechanize.rb in mechanize-0.6.4 vs lib/mechanize.rb in mechanize-0.6.5

- old
+ new

@@ -9,19 +9,28 @@ # required due to the missing get_fields method in Ruby 1.8.2 unless RUBY_VERSION > "1.8.2" $LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides") end + require 'net/http' require 'net/https' +# Monkey patch for ruby 1.8.4 +unless RUBY_VERSION > "1.8.4" +module Net # :nodoc: + class HTTPResponse # :nodoc: + CODE_TO_OBJ['500'] = HTTPInternalServerError + end +end +end + require 'uri' require 'webrick/httputils' require 'zlib' require 'stringio' require 'mechanize/hpricot' -require 'mechanize/mech_version' require 'mechanize/cookie' require 'mechanize/errors' require 'mechanize/pluggable_parsers' require 'mechanize/form' require 'mechanize/form_elements' @@ -48,19 +57,27 @@ # search_form = page.forms.name("f").first # search_form.fields.name("q").value = "Hello" # search_results = agent.submit(search_form) # puts search_results.body class Mechanize + ## + # The version of Mechanize you are using. + + VERSION = '0.6.5' + + ## + # User Agent aliases AGENT_ALIASES = { 'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', + 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6', 'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3', 'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3', 'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401', 'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624', 'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)', - 'Mechanize' => "WWW-Mechanize/#{Version} (http://rubyforge.org/projects/mechanize/)" + 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)" } attr_accessor :cookie_jar attr_accessor :log attr_accessor :max_history @@ -217,13 +234,18 @@ @history.last end # Returns whether or not a url has been visited def visited?(url) + ! visited_page(url).nil? + end + + # Returns a visited page for the url passed in, otherwise nil + def visited_page(url) url = url.uri if url.respond_to? :uri uri = to_absolute_uri(url).to_s - ! @history.find { |h| h.uri.to_s == uri }.nil? + @history.reverse.find { |h| h.uri.to_s == uri } end # Runs given block, then resets the page history as it was before. self is # given as a parameter to the block. Returns the value of the block. def transact @@ -260,25 +282,41 @@ end # Add User-Agent header to request request.add_field('User-Agent', @user_agent) if @user_agent + # Add If-Modified-Since if page is in history + if( (page = visited_page(uri)) && page.response['Last-Modified'] ) + request.add_field('If-Modified-Since', page.response['Last-Modified']) + end + request.basic_auth(@user, @password) if @user || @password request end private def to_absolute_uri(url, cur_page=current_page()) - url = URI.parse( - URI.unescape(Util.html_unescape(url.to_s.strip)).gsub(/ /, '%20') - ) unless url.is_a? URI + unless url.is_a? URI + url = url.to_s.strip + url = URI.parse( + Util.html_unescape( + url.split(/%[0-9A-Fa-f]{2}/).zip( + url.scan(/%[0-9A-Fa-f]{2}/) + ).map { |x,y| + "#{URI.escape(x)}#{y}" + }.join('') + ).gsub(/%23/, '#') + ) + end # construct an absolute uri if url.relative? raise 'no history. please specify an absolute URL' unless cur_page.uri url = cur_page.uri + url + # Strip initial "/.." bits from the path + url.path.sub!(/^(\/\.\.)+(?=\/)/, '') end return url end @@ -401,25 +439,27 @@ page = @pluggable_parser.parser(content_type).new( uri, response, response_body, response.code - ) + ) { |parser| + parser.mech = self if parser.respond_to? :mech= + if parser.respond_to?(:watch_for_set=) && @watch_for_set + parser.watch_for_set = @watch_for_set + end + } - page.mech = self if page.respond_to? :mech= - log.info("status: #{ page.code }") if log - if page.respond_to? :watch_for_set - page.watch_for_set = @watch_for_set - end - res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s] return page if res_klass <= Net::HTTPSuccess - if res_klass <= Net::HTTPRedirection + if res_klass == Net::HTTPNotModified + log.debug("Got cached page") if log + return visited_page(uri) + elsif res_klass <= Net::HTTPRedirection return page unless follow_redirect? log.info("follow redirect to: #{ response['Location'] }") if log abs_uri = to_absolute_uri(response['Location'].to_s, page) request = fetch_request(abs_uri) return fetch_page(abs_uri, request, page) @@ -442,19 +482,33 @@ vals.join("&") end def add_to_history(page) @history.push(page) - if @max_history and @history.size > @max_history - # keep only the last @max_history entries - @history = @history[@history.size - @max_history, @max_history] + if @max_history and @history.length > @max_history + while @history.length > @max_history + @history[0] = nil + @history.shift + end end end + # :stopdoc: class Util def self.html_unescape(s) - s.to_s.gsub(/&amp;/, "&").gsub(/&quot;/, '"').gsub(/&gt;/, ">").gsub(/&lt;/, "<") + return s unless s + s.gsub(/&(\w+|#[0-9]+);/) { |match| + number = case match + when /&(\w+);/ + Hpricot::NamedCharacters[$1] + when /&#([0-9]+);/ + $1.to_i + end + + number ? (number.chr rescue match) : match + } end end + # :startdoc: end end # module WWW