lib/mechanize.rb in mechanize-0.4.0 vs lib/mechanize.rb in mechanize-0.4.1
- old
+ new
@@ -5,392 +5,32 @@
# Copyright (c) 2006 by Aaron Patterson (aaronp@rubyforge.org)
#
# Please see the LICENSE file for licensing.
#
-Version = "0.4.0"
+Version = "0.4.1"
# required due to the missing get_fields method in Ruby 1.8.2
unless RUBY_VERSION > "1.8.2"
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides")
end
require 'net/http'
require 'net/https'
-require 'web/htmltools/xmltree' # narf
-require 'mechanize/parsing'
-require 'mechanize/cookie'
require 'uri'
require 'logger'
require 'webrick'
require 'date'
+require 'web/htmltools/xmltree' # narf
+require 'mechanize/parsing'
+require 'mechanize/cookie'
+require 'mechanize/form'
+require 'mechanize/form_elements'
+require 'mechanize/page'
+require 'mechanize/page_elements'
module WWW
-
-class Field
- attr_accessor :name, :value
-
- def initialize(name, value)
- @name, @value = name, value
- end
-
- # Returns an array of Field objects
- # TODO: is this correct?
- def self.extract_all_from(root_node)
- fields = []
- root_node.each_recursive {|node|
- if (node.name.downcase == 'input' and
- %w(text password hidden checkbox radio int).include?(node.attributes['type'].downcase)) or
- %w(textarea option).include?(node.name.downcase)
- fields << Field.new(node.attributes['name'], node.attributes['value'])
- end
- }
- return fields
- end
-end
-
-class FileUpload
- # value is the file-name, not the file-content
- attr_accessor :name
-
- attr_accessor :file_name, :file_data
-
- def initialize(name, file_name)
- @name, @file_name = name, file_name
- @file_data = nil
- end
-end
-
-class Button
- attr_accessor :name, :value
-
- def initialize(name, value)
- @name, @value = name, value
- end
-
- def add_to_query(query)
- query[@name] = @value || "" if @name
- end
-
- # Returns an array of Button objects
- def self.extract_all_from(root_node)
- buttons = []
- root_node.each_recursive {|node|
- if node.name.downcase == 'input' and
- ['submit'].include?(node.attributes['type'].downcase)
- buttons << Button.new(node.attributes['name'], node.attributes['value'])
- end
- }
- return buttons
- end
-end
-
-class ImageButton < Button
- attr_accessor :x, :y
-
- def add_to_query(query)
- if @name
- query[@name] = @value || ""
- query[@name+".x"] = (@x || "0").to_s
- query[@name+".y"] = (@y || "0").to_s
- end
- end
-end
-
-class RadioButton
- attr_accessor :name, :value, :checked
-
- def initialize(name, value, checked)
- @name, @value, @checked = name, value, checked
- end
-end
-
-class CheckBox
- attr_accessor :name, :value, :checked
-
- def initialize(name, value, checked)
- @name, @value, @checked = name, value, checked
- end
-end
-
-class SelectList
- attr_accessor :name, :value, :options
-
- def initialize(name, node)
- @name = name
- @options = []
-
- # parse
- node.each_recursive {|n|
- if n.name.downcase == 'option'
- value = n.attributes['value']
- @options << value
- @value = value if n.attributes['selected']
- end
- }
- end
-end
-
-# Class Form does not work in the case there is some invalid (unbalanced) html
-# involved, such as:
-#
-# <td>
-# <form>
-# </td>
-# <td>
-# <input .../>
-# </form>
-# </td>
-#
-# GlobalForm takes two nodes, the node where the form tag is located
-# (form_node), and another node, from which to start looking for form elements
-# (elements_node) like buttons and the like. For class Form both fall together
-# into one and the same node.
-
-class GlobalForm
- attr_reader :form_node, :elements_node
- attr_accessor :method, :action, :name
-
- attr_reader :fields, :buttons, :file_uploads, :radiobuttons, :checkboxes
-
- def initialize(form_node, elements_node)
- @form_node, @elements_node = form_node, elements_node
-
- @method = (@form_node.attributes['method'] || 'GET').upcase
- @action = @form_node.attributes['action']
- @name = @form_node.attributes['name']
-
- parse
- end
-
- # In the case of malformed HTML, fields of multiple forms might occure in this forms'
- # field array. If the fields have the same name, posterior fields overwrite former fields.
- # To avoid this, this method rejects all posterior duplicate fields.
-
- def uniq_fields!
- names_in = {}
- fields.reject! {|f|
- if names_in.include?(f.name)
- true
- else
- names_in[f.name] = true
- false
- end
- }
- end
-
- def build_query
- query = {}
-
- fields().each do |f|
- query[f.name] = f.value || ""
- end
-
- checkboxes().each do |f|
- query[f.name] = f.value || "on" if f.checked
- end
-
- radio_groups = {}
- radiobuttons().each do |f|
- radio_groups[f.name] ||= []
- radio_groups[f.name] << f
- end
-
- # take one radio button from each group
- radio_groups.each_value do |g|
- checked = g.select {|f| f.checked}
-
- if checked.size == 1
- f = checked.first
- query[f.name] = f.value || ""
- elsif checked.size > 1
- raise "multiple radiobuttons are checked in the same group!"
- end
- end
-
- query
- end
-
- def parse
- @fields = []
- @buttons = []
- @file_uploads = []
- @radiobuttons = []
- @checkboxes = []
-
- @elements_node.each_recursive {|node|
- case node.name.downcase
- when 'input'
- case (node.attributes['type'] || 'text').downcase
- when 'text', 'password', 'hidden', 'int'
- @fields << Field.new(node.attributes['name'], node.attributes['value'])
- when 'radio'
- @radiobuttons << RadioButton.new(node.attributes['name'], node.attributes['value'], node.attributes.has_key?('checked'))
- when 'checkbox'
- @checkboxes << CheckBox.new(node.attributes['name'], node.attributes['value'], node.attributes.has_key?('checked'))
- when 'file'
- @file_uploads << FileUpload.new(node.attributes['name'], node.attributes['value'])
- when 'submit'
- @buttons << Button.new(node.attributes['name'], node.attributes['value'])
- when 'image'
- @buttons << ImageButton.new(node.attributes['name'], node.attributes['value'])
- end
- when 'textarea'
- @fields << Field.new(node.attributes['name'], node.all_text)
- when 'select'
- @fields << SelectList.new(node.attributes['name'], node)
- end
- }
- end
-
-end
-
-class Form < GlobalForm
- attr_reader :node
-
- def initialize(node)
- @node = node
- super(@node, @node)
- end
-end
-
-class Link
- attr_reader :node
- attr_reader :href
- attr_reader :text
-
- def initialize(node)
- @node = node
- @href = node.attributes['href']
- @text = node.all_text
- end
-end
-
-class Meta < Link
-end
-
-# = Synopsis
-# This class encapsulates a page.
-#
-# == Example
-# require 'rubygems'
-# require 'mechanize'
-# require 'logger'
-#
-# class Body
-# def initialize(node)
-# puts node.attributes['bgcolor']
-# end
-# end
-#
-# agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") }
-# agent.user_agent_alias = 'Mac Safari'
-# page = agent.get("http://www.google.com/")
-# page.watch_for_set = { 'body' => Body }
-#
-# body = page.watches
-class Page
- attr_accessor :uri, :cookies, :response, :body, :code, :watch_for_set
-
- def initialize(uri=nil, cookies=[], response=nil, body=nil, code=nil)
- @uri, @cookies, @response, @body, @code = uri, cookies, response, body, code
- end
-
- def header
- @response.header
- end
-
- def content_type
- header['Content-Type']
- end
-
- def forms
- parse_html() unless @forms
- @forms
- end
-
- def links
- parse_html() unless @links
- @links
- end
-
- def root
- parse_html() unless @root
- @root
- end
-
- # This method watches out for a particular tag, and will call back to the
- # class specified for the tag in the watch_for_set method. See the example
- # in this class.
- def watches
- parse_html() unless @watches
- @watches
- end
-
- def meta
- parse_html() unless @meta
- @meta
- end
-
- private
-
- def parse_html
- raise "no html" unless content_type() =~ /^text\/html/
-
- # construct parser and feed with HTML
- parser = HTMLTree::XMLParser.new
- begin
- parser.feed(@body)
- rescue => ex
- if ex.message =~ /attempted adding second root element to document/ and
- # Put the whole document inside a single root element, which I simply name
- # <root>, just to make the parser happy. It's no longer valid HTML, but
- # without a single root element, it's not valid HTML as well.
-
- # TODO: leave a possible doctype definition outside this element.
- parser = HTMLTree::XMLParser.new
- parser.feed("<root>" + @body + "</root>")
- else
- raise
- end
- end
-
- @root = parser.document
-
- @forms = []
- @links = []
- @meta = []
- @watches = {}
-
- @root.each_recursive {|node|
- name = node.name.downcase
-
- case name
- when 'form'
- @forms << Form.new(node)
- when 'a'
- @links << Link.new(node)
- when 'meta'
- equiv = node.attributes['http-equiv']
- content = node.attributes['content']
- if equiv != nil && equiv.downcase == 'refresh'
- if content != nil && content =~ /^\d+\s*;\s*url\s*=\s*(\S+)/i
- node.attributes['href'] = $1
- @meta << Meta.new(node)
- end
- end
- else
- if @watch_for_set and @watch_for_set.keys.include?( name )
- @watches[name] = [] unless @watches[name]
- klass = @watch_for_set[name]
- @watches[name] << (klass ? klass.new(node) : node)
- end
- end
- }
- end
-end
-
class ResponseCodeError < RuntimeError
attr_reader :response_code
def initialize(response_code)
@response_code = response_code
@@ -501,11 +141,11 @@
button.add_to_query(query) if button
uri = to_absolute_uri(URI::escape(form.action))
case form.method.upcase
when 'POST'
- post(uri, query)
+ post_form(uri, form)
when 'GET'
if uri.query.nil?
get(uri.to_s + "?" + build_query_string(query))
else
get(uri.to_s + "&" + build_query_string(query))
@@ -538,9 +178,27 @@
raise 'no history. please specify an absolute URL'
end
end
return uri
+ end
+
+ def post_form(url, form)
+ cur_page = current_page() || Page.new
+
+ request_data = [form.request_data]
+
+ # this is called before the request is sent
+ pre_request_hook = proc {|request|
+ log.debug("query: #{ request_data.inspect }")
+ request.add_field('Content-Type', form.enctype)
+ request.add_field('Content-Length', request_data[0].size.to_s)
+ }
+
+ # fetch the page
+ page = fetch_page(to_absolute_uri(url, cur_page), :post, cur_page, pre_request_hook, request_data)
+ add_to_history(page)
+ page
end
# uri is an absolute URI
def fetch_page(uri, method=:get, cur_page=current_page(), pre_request_hook=nil, request_data=[])
raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)