lib/mechanize.rb in mechanize-0.6.4 vs lib/mechanize.rb in mechanize-0.6.5
- old
+ new
@@ -9,19 +9,28 @@
# required due to the missing get_fields method in Ruby 1.8.2
unless RUBY_VERSION > "1.8.2"
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides")
end
+
require 'net/http'
require 'net/https'
+# Monkey patch for ruby 1.8.4
+unless RUBY_VERSION > "1.8.4"
+module Net # :nodoc:
+ class HTTPResponse # :nodoc:
+ CODE_TO_OBJ['500'] = HTTPInternalServerError
+ end
+end
+end
+
require 'uri'
require 'webrick/httputils'
require 'zlib'
require 'stringio'
require 'mechanize/hpricot'
-require 'mechanize/mech_version'
require 'mechanize/cookie'
require 'mechanize/errors'
require 'mechanize/pluggable_parsers'
require 'mechanize/form'
require 'mechanize/form_elements'
@@ -48,19 +57,27 @@
# search_form = page.forms.name("f").first
# search_form.fields.name("q").value = "Hello"
# search_results = agent.submit(search_form)
# puts search_results.body
class Mechanize
+ ##
+ # The version of Mechanize you are using.
+
+ VERSION = '0.6.5'
+
+ ##
+ # User Agent aliases
AGENT_ALIASES = {
'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
+ 'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/418 (KHTML, like Gecko) Safari/417.9.3',
'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.0.3) Gecko/20060426 Firefox/1.5.0.3',
'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
- 'Mechanize' => "WWW-Mechanize/#{Version} (http://rubyforge.org/projects/mechanize/)"
+ 'Mechanize' => "WWW-Mechanize/#{VERSION} (http://rubyforge.org/projects/mechanize/)"
}
attr_accessor :cookie_jar
attr_accessor :log
attr_accessor :max_history
@@ -217,13 +234,18 @@
@history.last
end
# Returns whether or not a url has been visited
def visited?(url)
+ ! visited_page(url).nil?
+ end
+
+ # Returns a visited page for the url passed in, otherwise nil
+ def visited_page(url)
url = url.uri if url.respond_to? :uri
uri = to_absolute_uri(url).to_s
- ! @history.find { |h| h.uri.to_s == uri }.nil?
+ @history.reverse.find { |h| h.uri.to_s == uri }
end
# Runs given block, then resets the page history as it was before. self is
# given as a parameter to the block. Returns the value of the block.
def transact
@@ -260,25 +282,41 @@
end
# Add User-Agent header to request
request.add_field('User-Agent', @user_agent) if @user_agent
+ # Add If-Modified-Since if page is in history
+ if( (page = visited_page(uri)) && page.response['Last-Modified'] )
+ request.add_field('If-Modified-Since', page.response['Last-Modified'])
+ end
+
request.basic_auth(@user, @password) if @user || @password
request
end
private
def to_absolute_uri(url, cur_page=current_page())
- url = URI.parse(
- URI.unescape(Util.html_unescape(url.to_s.strip)).gsub(/ /, '%20')
- ) unless url.is_a? URI
+ unless url.is_a? URI
+ url = url.to_s.strip
+ url = URI.parse(
+ Util.html_unescape(
+ url.split(/%[0-9A-Fa-f]{2}/).zip(
+ url.scan(/%[0-9A-Fa-f]{2}/)
+ ).map { |x,y|
+ "#{URI.escape(x)}#{y}"
+ }.join('')
+ ).gsub(/%23/, '#')
+ )
+ end
# construct an absolute uri
if url.relative?
raise 'no history. please specify an absolute URL' unless cur_page.uri
url = cur_page.uri + url
+ # Strip initial "/.." bits from the path
+ url.path.sub!(/^(\/\.\.)+(?=\/)/, '')
end
return url
end
@@ -401,25 +439,27 @@
page = @pluggable_parser.parser(content_type).new(
uri,
response,
response_body,
response.code
- )
+ ) { |parser|
+ parser.mech = self if parser.respond_to? :mech=
+ if parser.respond_to?(:watch_for_set=) && @watch_for_set
+ parser.watch_for_set = @watch_for_set
+ end
+ }
- page.mech = self if page.respond_to? :mech=
-
log.info("status: #{ page.code }") if log
- if page.respond_to? :watch_for_set
- page.watch_for_set = @watch_for_set
- end
-
res_klass = Net::HTTPResponse::CODE_TO_OBJ[page.code.to_s]
return page if res_klass <= Net::HTTPSuccess
- if res_klass <= Net::HTTPRedirection
+ if res_klass == Net::HTTPNotModified
+ log.debug("Got cached page") if log
+ return visited_page(uri)
+ elsif res_klass <= Net::HTTPRedirection
return page unless follow_redirect?
log.info("follow redirect to: #{ response['Location'] }") if log
abs_uri = to_absolute_uri(response['Location'].to_s, page)
request = fetch_request(abs_uri)
return fetch_page(abs_uri, request, page)
@@ -442,19 +482,33 @@
vals.join("&")
end
def add_to_history(page)
@history.push(page)
- if @max_history and @history.size > @max_history
- # keep only the last @max_history entries
- @history = @history[@history.size - @max_history, @max_history]
+ if @max_history and @history.length > @max_history
+ while @history.length > @max_history
+ @history[0] = nil
+ @history.shift
+ end
end
end
+ # :stopdoc:
class Util
def self.html_unescape(s)
- s.to_s.gsub(/&amp;/, "&").gsub(/&quot;/, '"').gsub(/&gt;/, ">").gsub(/&lt;/, "<")
+ return s unless s
+ s.gsub(/&(\w+|#[0-9]+);/) { |match|
+ number = case match
+ when /&(\w+);/
+ Hpricot::NamedCharacters[$1]
+ when /&#([0-9]+);/
+ $1.to_i
+ end
+
+ number ? (number.chr rescue match) : match
+ }
end
end
+ # :startdoc:
end
end # module WWW