#
# Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de)
#
# This is a quick hack, to get something like Perl's WWW::Mechanize. Sure, we
# have Web::Unit, but, that does not work for me as expected, as it does not
# set cookies (I might be wrong), does not automatically redirect and has
# problems with some html documents.

Version = "0.1.3"

# required due to the missing get_fields method in Ruby 1.8.2
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides")
require 'net/http'
require 'net/https'

require 'web/htmltools/xmltree'   # narf
require 'mechanize/parsing'
require 'uri'
require 'logger'
require 'webrick'

module WWW

  # A named form value taken from an <input> element of type text, password,
  # hidden or checkbox.
  class Field
    attr_accessor :name, :value

    def initialize(name, value)
      @name, @value = name, value
    end

    # Returns an array of Field objects, one for every matching <input>
    # element found below (and including) +root_node+.
    def self.extract_all_from(root_node)
      fields = []
      root_node.each_recursive {|node|
        next unless node.name.downcase == 'input'
        # The type attribute may be absent; treat it as an empty string
        # (the same way GlobalForm#parse does) instead of calling #downcase
        # on nil.
        type = (node.attributes['type'] || '').downcase
        if ['text', 'password', 'hidden', 'checkbox'].include?(type)
          fields << Field.new(node.attributes['name'], node.attributes['value'])
        end
      }
      fields
    end
  end

  # An <input type="file"> element.
  class FileUpload
    # value is the file-name, not the file-content
    attr_accessor :name

    attr_accessor :file_name, :file_data

    def initialize(name, file_name)
      @name, @file_name = name, file_name
      @file_data = nil
    end
  end

  # An <input type="submit"> element.
  class Button
    attr_accessor :name, :value

    def initialize(name, value)
      @name, @value = name, value
    end

    # Returns an array of Button objects, one for every submit <input>
    # element found below (and including) +root_node+.
    def self.extract_all_from(root_node)
      buttons = []
      root_node.each_recursive {|node|
        next unless node.name.downcase == 'input'
        # nil-safe for <input> elements without a type attribute
        type = (node.attributes['type'] || '').downcase
        if ['submit'].include?(type)
          buttons << Button.new(node.attributes['name'], node.attributes['value'])
        end
      }
      buttons
    end
  end

  # Class Form does not work in the case there is some invalid (unbalanced) html
  # involved, such as:
  #
  #   <td>
  #     <form>
  #   </td>
  #   <td>
  #     <input .../>
  #     </form>
  #   </td>
  #
  # GlobalForm takes two nodes, the node where the form tag is located
  # (form_node), and another node, from which to start looking for form elements
  # (elements_node) like buttons and the like. For class Form both fall together
  # into one and the same node.
  class GlobalForm
    attr_reader :form_node, :elements_node
    attr_reader :method, :action, :name

    attr_reader :fields, :buttons, :file_uploads

    def initialize(form_node, elements_node)
      @form_node, @elements_node = form_node, elements_node

      # NOTE: a missing method attribute falls back to POST here.
      @method = (@form_node.attributes['method'] || 'POST').upcase
      @action = @form_node.attributes['action']
      @name = @form_node.attributes['name']

      parse
    end

    # Walks elements_node and (re)builds the fields, buttons and file_uploads
    # lists from the <input> elements found. Inputs without a type attribute,
    # or with a type not listed below, are ignored.
    def parse
      @fields = []
      @buttons = []
      @file_uploads = []

      @elements_node.each_recursive {|node|
        next unless node.name.downcase == 'input'

        name, value = node.attributes['name'], node.attributes['value']
        case (node.attributes['type'] || '').downcase
        when 'text', 'password', 'hidden', 'checkbox'
          @fields << Field.new(name, value)
        when 'file'
          @file_uploads << FileUpload.new(name, value)
        when 'submit'
          @buttons << Button.new(name, value)
        end
      }
    end
  end

  # A form whose tag and whose elements live in one and the same node.
  class Form < GlobalForm
    attr_reader :node

    def initialize(node)
      @node = node
      super(@node, @node)
    end
  end

  # An <a> element together with its href attribute.
  class Link
    attr_reader :node
    attr_reader :href

    def initialize(node)
      @node = node
      @href = node.attributes['href']
    end
  end

  # A fetched document. The HTML is parsed lazily, on first access to
  # forms, links, root or watches.
  class Page
    attr_accessor :uri, :cookies, :response, :body, :code, :watch_for_set

    def initialize(uri=nil, cookies=[], response=nil, body=nil, code=nil)
      @uri, @cookies, @response, @body, @code = uri, cookies, response, body, code
    end

    def header
      @response.header
    end

    def content_type
      header['Content-Type']
    end

    # Array of Form objects, one per <form> element.
    def forms
      parse_html() unless @forms
      @forms
    end

    # Array of Link objects, one per <a> element.
    def links
      parse_html() unless @links
      @links
    end

    # The parsed document.
    def root
      parse_html() unless @root
      @root
    end

    # Hash mapping each watched tag name (see watch_for_set) to the array of
    # nodes (or wrapper objects) collected for it.
    def watches
      parse_html() unless @watches
      @watches
    end

    private

    # Parses @body and fills @root, @forms, @links and @watches.
    # Raises "no html" unless the Content-Type starts with text/html.
    def parse_html
      raise "no html" unless content_type() =~ /^text\/html/

      # construct parser and feed with HTML
      parser = HTMLTree::XMLParser.new
      begin
        parser.feed(@body)
      rescue => ex
        if ex.message =~ /attempted adding second root element to document/
          # Put the whole document inside a single root element, which I simply name
          # <root>, just to make the parser happy. It's no longer valid HTML, but
          # without a single root element, it's not valid HTML as well.

          # TODO: leave a possible doctype definition outside this element.
          parser = HTMLTree::XMLParser.new
          parser.feed("<root>" + @body + "</root>")
        else
          raise
        end
      end

      @root = parser.document

      @forms = []
      @links = []
      @watches = {}

      @root.each_recursive {|node|
        name = node.name.downcase

        case name
        when 'form'
          @forms << Form.new(node)
        when 'a'
          @links << Link.new(node)
        else
          if @watch_for_set && @watch_for_set.key?(name)
            @watches[name] ||= []
            # A nil value in watch_for_set means "collect the raw node";
            # otherwise the node is wrapped in an instance of the given class.
            klass = @watch_for_set[name]
            @watches[name] << (klass ? klass.new(node) : node)
          end
        end
      }
    end
  end

  # A small stateful HTTP client: keeps cookies and a page history, sends a
  # Referer header and follows 302 redirects.
  class Mechanize

    AGENT_ALIASES = {
      'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
      'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
      'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-us) AppleWebKit/85 (KHTML, like Gecko) Safari/85',
      'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
      'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
      'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
    }

    attr_accessor :log
    attr_accessor :user_agent
    attr_accessor :cookies
    attr_accessor :open_timeout, :read_timeout
    attr_accessor :watch_for_set
    attr_accessor :max_history

    # Yields the new agent to the optional block for configuration.
    def initialize
      @history = []
      @cookies = []
      @log = Logger.new(nil)
      yield self if block_given?
    end

    # Sets user_agent from one of the keys of AGENT_ALIASES.
    # Raises "unknown agent alias" for any other key.
    def user_agent_alias=(al)
      self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias")
    end

    # Fetches +url+ (String or URI, absolute or relative to the current page)
    # with GET, records the resulting page in the history and returns it.
    def get(url)
      cur_page = current_page() || Page.new

      # fetch the page
      page = fetch_page(to_absolute_uri(url, cur_page), :get, cur_page)
      add_to_history(page)
      page
    end

    # Sends +query+ (a Hash) form-urlencoded to +url+ with POST, records the
    # resulting page in the history and returns it.
    def post(url, query={})
      cur_page = current_page() || Page.new

      request_data = [build_query_string(query)]

      # this is called before the request is sent
      pre_request_hook = proc {|request|
        log.debug("query: #{ query.inspect }")
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        request.add_header('Content-Length', request_data[0].size.to_s)
      }

      # fetch the page
      page = fetch_page(to_absolute_uri(url, cur_page), :post, cur_page, pre_request_hook, request_data)
      add_to_history(page)
      page
    end

    # Follows +link+ (anything responding to #href) with a GET request.
    def click(link)
      uri = to_absolute_uri(link.href)
      get(uri)
    end

    # Submits +form+ with the values of all its fields, plus the name/value
    # of +button+ when one is given. Raises 'unsupported method' for form
    # methods other than POST and GET.
    def submit(form, button=nil)
      query = {}
      form.fields.each do |f|
        # A control without a name cannot be encoded as name=value
        # (a nil key would make the escaping fail), so leave it out.
        next if f.name.nil?
        query[f.name] = f.value || ""
      end
      query[button.name] = button.value || "" if button && button.name

      uri = to_absolute_uri(form.action)
      case form.method.upcase
      when 'POST'
        post(uri, query)
      when 'GET'
        # uri is a URI object, so `uri + "?" + string` would perform two
        # successive relative-reference merges and replace the last path
        # segment with the query string. Set the query component on a copy
        # instead.
        target = uri.dup
        target.query = build_query_string(query)
        get(target)
      else
        raise 'unsupported method'
      end
    end

    # The most recently fetched page, or nil if nothing was fetched yet.
    def current_page
      @history.last
    end

    alias page current_page

    private

    # Turns +url+ (String or URI) into an absolute URI, resolving a relative
    # one against the URI of +cur_page+. Raises if +url+ is relative and there
    # is no page with a URI to resolve it against.
    def to_absolute_uri(url, cur_page=current_page())
      uri = url.is_a?(URI) ? url : URI.parse(url)

      # construct an absolute uri
      if uri.relative?
        # get/post pass a blank Page (uri == nil) when the history is empty,
        # so the page's uri has to be checked as well as the page itself.
        if cur_page && cur_page.uri
          uri = cur_page.uri + url
        else
          raise 'no history. please specify an absolute URL'
        end
      end

      uri
    end

    # Performs the request and returns the resulting Page.
    #
    # uri is an absolute URI; method is :get or :post (anything else raises
    # ArgumentError); cur_page supplies the Referer; pre_request_hook, if
    # given, is called with the request object before it is sent;
    # request_data holds the extra arguments passed on to Net::HTTP#request
    # (the body for a POST).
    #
    # A 302 response is followed with a GET request. Any status other than
    # 200 and 302 raises a RuntimeError.
    def fetch_page(uri, method=:get, cur_page=current_page(), pre_request_hook=nil, request_data=[])
      raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)

      log.info("#{ method.to_s.upcase }: #{ uri.to_s }")

      page = Page.new(uri)

      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = true if uri.scheme == "https"

      # Specify timeouts if given. This has to happen before the connection
      # is opened by #start, otherwise open_timeout has no effect.
      http.open_timeout = @open_timeout if @open_timeout
      http.read_timeout = @read_timeout if @read_timeout

      http.start {

        case method
        when :get
          request = Net::HTTP::Get.new(uri.request_uri)
        when :post
          request = Net::HTTP::Post.new(uri.request_uri)
        else
          raise ArgumentError
        end

        unless @cookies.empty?
          cookie = @cookies.uniq.join("; ")
          log.debug("use cookie: #{ cookie }")
          request.add_header('Cookie', cookie)
        end

        # Add Referer header to request
        if cur_page && cur_page.uri
          request.add_header('Referer', cur_page.uri.to_s)
        end

        # Add User-Agent header to request
        request.add_header('User-Agent', @user_agent) if @user_agent

        # Invoke pre-request-hook (use it to add custom headers or content)
        pre_request_hook.call(request) if pre_request_hook

        # Log specified headers for the request
        request.each_header do |k, v|
          log.debug("request-header: #{ k } => #{ v }")
        end

        # Send the request
        http.request(request, *request_data) {|response|

          # TODO: expire/validate cookies
          (response.get_fields('Set-Cookie')||[]).each do |cookie|
            log.debug("cookie received: #{ cookie }")
            # keep only the leading name=value pair, drop the attributes
            @cookies << cookie.split(";").first.strip
          end

          response.each_header {|k,v|
            log.debug("header: #{ k } : #{ v }")
          }

          page.response = response
          page.code = response.code

          response.read_body
          page.body = response.body

          log.info("status: #{ page.code }")

          page.watch_for_set = @watch_for_set

          case page.code
          when "200"
            return page
          when "302"
            location = response['Location']
            log.info("follow redirect to: #{ location }")
            return fetch_page(to_absolute_uri(location, page), :get, page)
          else
            # Still a RuntimeError, as the former bare `raise` was, but it
            # now says which status caused it.
            raise "unhandled response code: #{ page.code }"
          end
        }
      }
    end

    # Encodes +hash+ as an application/x-www-form-urlencoded string.
    # Keys and values are converted with #to_s before being escaped.
    def build_query_string(hash)
      hash.map {|k, v|
        [WEBrick::HTTPUtils.escape_form(k.to_s), WEBrick::HTTPUtils.escape_form(v.to_s)].join("=")
      }.join("&")
    end

    # Appends +page+ to the history and, when max_history is set, drops the
    # oldest entries beyond that limit.
    def add_to_history(page)
      @history.push(page)
      # Trim only once the limit is exceeded. (Slicing with a negative,
      # out-of-range start while below the limit would yield nil and wipe
      # the history.)
      if @max_history && @history.size > @max_history
        # keep only the last @max_history entries
        @history = @history[@history.size - @max_history, @max_history]
      end
    end

  end

end # module WWW