lib/mechanize.rb in mechanize-0.4.0 vs lib/mechanize.rb in mechanize-0.4.1

- old
+ new

@@ -5,392 +5,32 @@ # Copyright (c) 2006 by Aaron Patterson (aaronp@rubyforge.org) # # Please see the LICENSE file for licensing. # -Version = "0.4.0" +Version = "0.4.1" # required due to the missing get_fields method in Ruby 1.8.2 unless RUBY_VERSION > "1.8.2" $LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides") end require 'net/http' require 'net/https' -require 'web/htmltools/xmltree' # narf -require 'mechanize/parsing' -require 'mechanize/cookie' require 'uri' require 'logger' require 'webrick' require 'date' +require 'web/htmltools/xmltree' # narf +require 'mechanize/parsing' +require 'mechanize/cookie' +require 'mechanize/form' +require 'mechanize/form_elements' +require 'mechanize/page' +require 'mechanize/page_elements' module WWW - -class Field - attr_accessor :name, :value - - def initialize(name, value) - @name, @value = name, value - end - - # Returns an array of Field objects - # TODO: is this correct? - def self.extract_all_from(root_node) - fields = [] - root_node.each_recursive {|node| - if (node.name.downcase == 'input' and - %w(text password hidden checkbox radio int).include?(node.attributes['type'].downcase)) or - %w(textarea option).include?(node.name.downcase) - fields << Field.new(node.attributes['name'], node.attributes['value']) - end - } - return fields - end -end - -class FileUpload - # value is the file-name, not the file-content - attr_accessor :name - - attr_accessor :file_name, :file_data - - def initialize(name, file_name) - @name, @file_name = name, file_name - @file_data = nil - end -end - -class Button - attr_accessor :name, :value - - def initialize(name, value) - @name, @value = name, value - end - - def add_to_query(query) - query[@name] = @value || "" if @name - end - - # Returns an array of Button objects - def self.extract_all_from(root_node) - buttons = [] - root_node.each_recursive {|node| - if node.name.downcase == 'input' and - ['submit'].include?(node.attributes['type'].downcase) - buttons << Button.new(node.attributes['name'], node.attributes['value']) - end - } - return buttons - end -end - -class ImageButton < Button - attr_accessor :x, :y - - def add_to_query(query) - if @name - query[@name] = @value || "" - query[@name+".x"] = (@x || "0").to_s - query[@name+".y"] = (@y || "0").to_s - end - end -end - -class RadioButton - attr_accessor :name, :value, :checked - - def initialize(name, value, checked) - @name, @value, @checked = name, value, checked - end -end - -class CheckBox - attr_accessor :name, :value, :checked - - def initialize(name, value, checked) - @name, @value, @checked = name, value, checked - end -end - -class SelectList - attr_accessor :name, :value, :options - - def initialize(name, node) - @name = name - @options = [] - - # parse - node.each_recursive {|n| - if n.name.downcase == 'option' - value = n.attributes['value'] - @options << value - @value = value if n.attributes['selected'] - end - } - end -end - -# Class Form does not work in the case there is some invalid (unbalanced) html -# involved, such as: -# -# <td> -# <form> -# </td> -# <td> -# <input .../> -# </form> -# </td> -# -# GlobalForm takes two nodes, the node where the form tag is located -# (form_node), and another node, from which to start looking for form elements -# (elements_node) like buttons and the like. For class Form both fall together -# into one and the same node. - -class GlobalForm - attr_reader :form_node, :elements_node - attr_accessor :method, :action, :name - - attr_reader :fields, :buttons, :file_uploads, :radiobuttons, :checkboxes - - def initialize(form_node, elements_node) - @form_node, @elements_node = form_node, elements_node - - @method = (@form_node.attributes['method'] || 'GET').upcase - @action = @form_node.attributes['action'] - @name = @form_node.attributes['name'] - - parse - end - - # In the case of malformed HTML, fields of multiple forms might occure in this forms' - # field array. If the fields have the same name, posterior fields overwrite former fields. - # To avoid this, this method rejects all posterior duplicate fields. - - def uniq_fields! - names_in = {} - fields.reject! {|f| - if names_in.include?(f.name) - true - else - names_in[f.name] = true - false - end - } - end - - def build_query - query = {} - - fields().each do |f| - query[f.name] = f.value || "" - end - - checkboxes().each do |f| - query[f.name] = f.value || "on" if f.checked - end - - radio_groups = {} - radiobuttons().each do |f| - radio_groups[f.name] ||= [] - radio_groups[f.name] << f - end - - # take one radio button from each group - radio_groups.each_value do |g| - checked = g.select {|f| f.checked} - - if checked.size == 1 - f = checked.first - query[f.name] = f.value || "" - elsif checked.size > 1 - raise "multiple radiobuttons are checked in the same group!" - end - end - - query - end - - def parse - @fields = [] - @buttons = [] - @file_uploads = [] - @radiobuttons = [] - @checkboxes = [] - - @elements_node.each_recursive {|node| - case node.name.downcase - when 'input' - case (node.attributes['type'] || 'text').downcase - when 'text', 'password', 'hidden', 'int' - @fields << Field.new(node.attributes['name'], node.attributes['value']) - when 'radio' - @radiobuttons << RadioButton.new(node.attributes['name'], node.attributes['value'], node.attributes.has_key?('checked')) - when 'checkbox' - @checkboxes << CheckBox.new(node.attributes['name'], node.attributes['value'], node.attributes.has_key?('checked')) - when 'file' - @file_uploads << FileUpload.new(node.attributes['name'], node.attributes['value']) - when 'submit' - @buttons << Button.new(node.attributes['name'], node.attributes['value']) - when 'image' - @buttons << ImageButton.new(node.attributes['name'], node.attributes['value']) - end - when 'textarea' - @fields << Field.new(node.attributes['name'], node.all_text) - when 'select' - @fields << SelectList.new(node.attributes['name'], node) - end - } - end - -end - -class Form < GlobalForm - attr_reader :node - - def initialize(node) - @node = node - super(@node, @node) - end -end - -class Link - attr_reader :node - attr_reader :href - attr_reader :text - - def initialize(node) - @node = node - @href = node.attributes['href'] - @text = node.all_text - end -end - -class Meta < Link -end - -# = Synopsis -# This class encapsulates a page. -# -# == Example -# require 'rubygems' -# require 'mechanize' -# require 'logger' -# -# class Body -# def initialize(node) -# puts node.attributes['bgcolor'] -# end -# end -# -# agent = WWW::Mechanize.new { |a| a.log = Logger.new("mech.log") } -# agent.user_agent_alias = 'Mac Safari' -# page = agent.get("http://www.google.com/") -# page.watch_for_set = { 'body' => Body } -# -# body = page.watches -class Page - attr_accessor :uri, :cookies, :response, :body, :code, :watch_for_set - - def initialize(uri=nil, cookies=[], response=nil, body=nil, code=nil) - @uri, @cookies, @response, @body, @code = uri, cookies, response, body, code - end - - def header - @response.header - end - - def content_type - header['Content-Type'] - end - - def forms - parse_html() unless @forms - @forms - end - - def links - parse_html() unless @links - @links - end - - def root - parse_html() unless @root - @root - end - - # This method watches out for a particular tag, and will call back to the - # class specified for the tag in the watch_for_set method. See the example - # in this class. - def watches - parse_html() unless @watches - @watches - end - - def meta - parse_html() unless @meta - @meta - end - - private - - def parse_html - raise "no html" unless content_type() =~ /^text\/html/ - - # construct parser and feed with HTML - parser = HTMLTree::XMLParser.new - begin - parser.feed(@body) - rescue => ex - if ex.message =~ /attempted adding second root element to document/ and - # Put the whole document inside a single root element, which I simply name - # <root>, just to make the parser happy. It's no longer valid HTML, but - # without a single root element, it's not valid HTML as well. - - # TODO: leave a possible doctype definition outside this element. - parser = HTMLTree::XMLParser.new - parser.feed("<root>" + @body + "</root>") - else - raise - end - end - - @root = parser.document - - @forms = [] - @links = [] - @meta = [] - @watches = {} - - @root.each_recursive {|node| - name = node.name.downcase - - case name - when 'form' - @forms << Form.new(node) - when 'a' - @links << Link.new(node) - when 'meta' - equiv = node.attributes['http-equiv'] - content = node.attributes['content'] - if equiv != nil && equiv.downcase == 'refresh' - if content != nil && content =~ /^\d+\s*;\s*url\s*=\s*(\S+)/i - node.attributes['href'] = $1 - @meta << Meta.new(node) - end - end - else - if @watch_for_set and @watch_for_set.keys.include?( name ) - @watches[name] = [] unless @watches[name] - klass = @watch_for_set[name] - @watches[name] << (klass ? klass.new(node) : node) - end - end - } - end -end - class ResponseCodeError < RuntimeError attr_reader :response_code def initialize(response_code) @response_code = response_code @@ -501,11 +141,11 @@ button.add_to_query(query) if button uri = to_absolute_uri(URI::escape(form.action)) case form.method.upcase when 'POST' - post(uri, query) + post_form(uri, form) when 'GET' if uri.query.nil? get(uri.to_s + "?" + build_query_string(query)) else get(uri.to_s + "&" + build_query_string(query)) @@ -538,9 +178,27 @@ raise 'no history. please specify an absolute URL' end end return uri + end + + def post_form(url, form) + cur_page = current_page() || Page.new + + request_data = [form.request_data] + + # this is called before the request is sent + pre_request_hook = proc {|request| + log.debug("query: #{ request_data.inspect }") + request.add_field('Content-Type', form.enctype) + request.add_field('Content-Length', request_data[0].size.to_s) + } + + # fetch the page + page = fetch_page(to_absolute_uri(url, cur_page), :post, cur_page, pre_request_hook, request_data) + add_to_history(page) + page end # uri is an absolute URI def fetch_page(uri, method=:get, cur_page=current_page(), pre_request_hook=nil, request_data=[]) raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)