#
# Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de)
#
# This is a quick hack, to get something like Perl's WWW::Mechanize. Sure, we
# have Web::Unit, but, that does not work for me as expected, as it does not
# set cookies (I might be wrong), does not automatically redirect and has
# problems with some html documents.
# Library version string.
# NOTE(review): this constant is defined at the top level, outside the WWW
# module, so it is reachable as ::Version -- confirm that this is intended.
Version = "0.1.3"
# required due to the missing get_fields method in Ruby 1.8.2
# (puts the bundled "mechanize/net-overrides" directory at the front of the
# load path; presumably so that the net/* requires below pick up the
# overriding files first)
$LOAD_PATH.unshift File.join(File.dirname(__FILE__), "mechanize", "net-overrides")
require 'net/http'
require 'net/https'
require 'web/htmltools/xmltree' # narf
require 'mechanize/parsing'
require 'uri'
require 'logger'
require 'webrick'
module WWW
# A name/value pair describing a plain form input element.
class Field
  # Input types that are collected as fields (compared case-insensitively).
  INPUT_TYPES = ['text', 'password', 'hidden', 'checkbox']

  attr_accessor :name, :value

  def initialize(name, value)
    @name, @value = name, value
  end

  # Returns an array of Field objects, one for every descendant of
  # +root_node+ that is an input element whose type is listed in INPUT_TYPES.
  #
  # +root_node+ must respond to +each_recursive+ and yield nodes that respond
  # to +name+ and +attributes+.
  def self.extract_all_from(root_node)
    fields = []
    root_node.each_recursive do |node|
      next unless node.name.downcase == 'input'
      # An input without a type attribute yields nil here. Treat it as an
      # empty type (the same guard GlobalForm#parse uses) instead of failing
      # with NoMethodError on nil.
      type = (node.attributes['type'] || '').downcase
      next unless INPUT_TYPES.include?(type)
      fields << Field.new(node.attributes['name'], node.attributes['value'])
    end
    fields
  end
end
# Describes a file-upload input element of a form.
class FileUpload
  # Name of the input element.
  attr_accessor :name
  # file_name holds the name of the file, not its content. file_data is
  # initialised to nil and can be assigned by the caller.
  attr_accessor :file_name, :file_data

  def initialize(name, file_name)
    @name = name
    @file_name = file_name
    @file_data = nil
  end
end
# A name/value pair describing a submit button of a form.
class Button
  attr_accessor :name, :value

  def initialize(name, value)
    @name, @value = name, value
  end

  # Returns an array of Button objects, one for every descendant of
  # +root_node+ that is an input element of type "submit" (compared
  # case-insensitively).
  #
  # +root_node+ must respond to +each_recursive+ and yield nodes that respond
  # to +name+ and +attributes+.
  def self.extract_all_from(root_node)
    buttons = []
    root_node.each_recursive do |node|
      next unless node.name.downcase == 'input'
      # An input without a type attribute yields nil here. Treat it as an
      # empty type (the same guard GlobalForm#parse uses) instead of failing
      # with NoMethodError on nil.
      type = (node.attributes['type'] || '').downcase
      next unless type == 'submit'
      buttons << Button.new(node.attributes['name'], node.attributes['value'])
    end
    buttons
  end
end
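# Class Form does not work when invalid (unbalanced) HTML is involved, for
# example when the form tag is opened inside one element while its input
# elements live inside another:
#
#   <td>
#     <form>
#   </td>
#   <td>
#     <input .../>
#     </form>
#   </td>
#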
# GlobalForm takes two nodes, the node where the form tag is located
# (form_node), and another node, from which to start looking for form elements
# (elements_node) like buttons and the like. For class Form both fall together
# into one and the same node.
# A form described by two nodes: the node carrying the form tag itself
# (form_node) and the node below which the form's elements are searched
# (elements_node).
class GlobalForm
  attr_reader :form_node, :elements_node
  attr_reader :method, :action, :name
  attr_reader :fields, :buttons, :file_uploads

  # Reads method, action and name from form_node's attributes and then
  # collects the form elements found below elements_node. A missing method
  # attribute results in 'POST'; the method is always stored upper-cased.
  def initialize(form_node, elements_node)
    @form_node = form_node
    @elements_node = elements_node

    form_attrs = @form_node.attributes
    declared_method = form_attrs['method']
    @method = declared_method ? declared_method.upcase : 'POST'
    @action = form_attrs['action']
    @name = form_attrs['name']

    parse
  end

  # Rebuilds the fields, buttons and file_uploads lists from every input
  # element below elements_node. Inputs whose type is missing or not handled
  # here are ignored.
  def parse
    @fields, @buttons, @file_uploads = [], [], []

    @elements_node.each_recursive do |node|
      next unless node.name.downcase == 'input'

      attrs = node.attributes
      type = attrs['type']
      type = type ? type.downcase : ''

      if ['text', 'password', 'hidden', 'checkbox'].include?(type)
        @fields << Field.new(attrs['name'], attrs['value'])
      elsif type == 'file'
        @file_uploads << FileUpload.new(attrs['name'], attrs['value'])
      elsif type == 'submit'
        @buttons << Button.new(attrs['name'], attrs['value'])
      end
    end
  end
end
# A form whose tag node and element container are one and the same node.
class Form < GlobalForm
  attr_reader :node

  # node serves both as the form node and as the node below which the
  # form elements are searched.
  def initialize(node)
    super(node, node)
    @node = node
  end
end
# Wraps a link node and exposes its href attribute.
class Link
  attr_reader :node, :href

  # node must respond to +attributes+; the 'href' entry is stored as href
  # (nil when the attribute is absent).
  def initialize(node)
    @node = node
    @href = @node.attributes['href']
  end
end
# A fetched document together with its response. The body is parsed lazily,
# the first time forms, links, root or watches is requested.
class Page
  attr_accessor :uri, :cookies, :response, :body, :code, :watch_for_set

  def initialize(uri=nil, cookies=[], response=nil, body=nil, code=nil)
    @uri, @cookies, @response, @body, @code = uri, cookies, response, body, code
  end

  # The header object of the response.
  def header
    @response.header
  end

  # The value of the Content-Type response header.
  def content_type
    header['Content-Type']
  end

  # Array of Form objects, one per form element of the document.
  def forms
    parse_html() unless @forms
    @forms
  end

  # Array of Link objects, one per a element of the document.
  def links
    parse_html() unless @links
    @links
  end

  # The parsed document.
  def root
    parse_html() unless @root
    @root
  end

  # Hash mapping each watched (lower-cased) element name that occurs in the
  # document to an array of the matching nodes, or of the objects built from
  # them by the class registered in watch_for_set.
  def watches
    parse_html() unless @watches
    @watches
  end

  private

  # Parses the body and fills @root, @forms, @links and @watches.
  # Raises "no html" unless the content type starts with text/html.
  # Parser errors other than the "second root element" case are re-raised.
  def parse_html
    raise "no html" unless content_type() =~ /^text\/html/

    # construct parser and feed with HTML
    parser = HTMLTree::XMLParser.new
    begin
      parser.feed(@body)
    rescue => ex
      raise unless ex.message =~ /attempted adding second root element to document/

      # Put the whole document inside a single root element, which is simply
      # named <root>, just to make the parser happy. It's no longer valid
      # HTML, but without a single root element it's not valid HTML either.
      # Feeding the unwrapped body again would only fail the same way.
      # TODO: leave a possible doctype definition outside this element.
      parser = HTMLTree::XMLParser.new
      parser.feed("<root>" + @body + "</root>")
    end

    @root = parser.document
    @forms = []
    @links = []
    @watches = {}

    @root.each_recursive {|node|
      name = node.name.downcase
      case name
      when 'form'
        @forms << Form.new(node)
      when 'a'
        @links << Link.new(node)
      else
        if @watch_for_set && @watch_for_set.key?(name)
          # a nil entry in watch_for_set means "collect the raw node"
          klass = @watch_for_set[name]
          (@watches[name] ||= []) << (klass ? klass.new(node) : node)
        end
      end
    }
  end
end
# A simple web agent: fetches pages via GET/POST, follows redirects, resends
# the cookies it has received and keeps a history of visited pages.
class Mechanize
  # Short names for some User-Agent strings; see user_agent_alias=.
  AGENT_ALIASES = {
    'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
    'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-us) AppleWebKit/85 (KHTML, like Gecko) Safari/85',
    'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
    'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
    'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
  }

  attr_accessor :log
  attr_accessor :user_agent
  attr_accessor :cookies
  attr_accessor :open_timeout, :read_timeout
  attr_accessor :watch_for_set
  attr_accessor :max_history

  # Starts with an empty history, no cookies and a logger that discards all
  # output. Yields the new agent when a block is given.
  def initialize
    @history = []
    @cookies = []
    @log = Logger.new(nil)
    yield self if block_given?
  end

  # Sets the user agent from one of the AGENT_ALIASES keys.
  # Raises "unknown agent alias" for any other name.
  def user_agent_alias=(al)
    self.user_agent = AGENT_ALIASES[al] || raise("unknown agent alias")
  end

  # Fetches +url+ (String or URI, resolved against the current page when
  # relative) with GET, adds the resulting page to the history and returns it.
  def get(url)
    cur_page = current_page() || Page.new

    # fetch the page
    page = fetch_page(to_absolute_uri(url, cur_page), :get, cur_page)
    add_to_history(page)
    page
  end

  # Sends +query+ (a hash of name => value) form-urlencoded to +url+ with
  # POST, adds the resulting page to the history and returns it.
  def post(url, query={})
    cur_page = current_page() || Page.new

    request_data = [build_query_string(query)]

    # this is called before the request is sent
    pre_request_hook = proc {|request|
      log.debug("query: #{ query.inspect }")
      request.add_header('Content-Type', 'application/x-www-form-urlencoded')
      request.add_header('Content-Length', request_data[0].size.to_s)
    }

    # fetch the page
    page = fetch_page(to_absolute_uri(url, cur_page), :post, cur_page, pre_request_hook, request_data)
    add_to_history(page)
    page
  end

  # Follows +link+ (anything responding to +href+) with a GET request.
  def click(link)
    uri = to_absolute_uri(link.href)
    get(uri)
  end

  # Submits +form+ using its fields and, if given, +button+. Depending on
  # the form's method this results in a POST or a GET request; any other
  # method raises 'unsupported method'.
  def submit(form, button=nil)
    query = {}
    form.fields.each do |f|
      # an input without a name cannot be encoded as a name=value pair
      next unless f.name
      query[f.name] = f.value || ""
    end
    query[button.name] = button.value || "" if button && button.name

    uri = to_absolute_uri(form.action)
    case form.method.upcase
    when 'POST'
      post(uri, query)
    when 'GET'
      # Set the query component directly. Writing uri + "?" + query_string
      # is evaluated left to right, so the query string would be merged as a
      # relative path and replace the last path segment of the URI.
      query_string = build_query_string(query)
      target = uri.dup
      target.query = query_string unless query_string.empty?
      get(target)
    else
      raise 'unsupported method'
    end
  end

  # The most recently visited page, or nil when nothing was fetched yet.
  def current_page
    @history.last
  end

  alias page current_page

  private

  # Turns +url+ (String or URI) into an absolute URI, resolving a relative
  # one against cur_page.uri. Raises when +url+ is relative and there is no
  # page with a URI to resolve it against.
  def to_absolute_uri(url, cur_page=current_page())
    uri = url.is_a?(URI) ? url : URI.parse(url)

    # construct an absolute uri
    if uri.relative?
      # get and post pass an empty Page (uri == nil) when the history is
      # empty, so the page's uri has to be checked as well
      unless cur_page && cur_page.uri
        raise 'no history. please specify an absolute URL'
      end
      uri = cur_page.uri + url
    end

    uri
  end

  # Performs the request and returns the resulting Page.
  #
  # uri is an absolute URI; only the http and https schemes are supported.
  # method is :get or :post. cur_page supplies the Referer header.
  # pre_request_hook, if given, is called with the request before it is
  # sent. request_data is passed on to the HTTP request call as extra
  # arguments. Responses with code 301, 302 or 303 are followed with a GET
  # request; any code other than 200 or one of those raises a RuntimeError.
  def fetch_page(uri, method=:get, cur_page=current_page(), pre_request_hook=nil, request_data=[])
    raise "unsupported scheme" unless ['http', 'https'].include?(uri.scheme)

    log.info("#{ method.to_s.upcase }: #{ uri.to_s }")

    page = Page.new(uri)

    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true if uri.scheme == "https"

    http.start {
      case method
      when :get
        request = Net::HTTP::Get.new(uri.request_uri)
      when :post
        request = Net::HTTP::Post.new(uri.request_uri)
      else
        raise ArgumentError
      end

      unless @cookies.empty?
        cookie = @cookies.uniq.join("; ")
        log.debug("use cookie: #{ cookie }")
        request.add_header('Cookie', cookie)
      end

      # Add Referer header to request
      unless cur_page.uri.nil?
        request.add_header('Referer', cur_page.uri.to_s)
      end

      # Add User-Agent header to request
      request.add_header('User-Agent', @user_agent) if @user_agent

      # Invoke pre-request-hook (use it to add custom headers or content)
      pre_request_hook.call(request) if pre_request_hook

      # Log specified headers for the request
      request.each_header do |k, v|
        log.debug("request-header: #{ k } => #{ v }")
      end

      # Specify timeouts if given
      http.open_timeout = @open_timeout if @open_timeout
      http.read_timeout = @read_timeout if @read_timeout

      # Send the request
      http.request(request, *request_data) {|response|
        # TODO: expire/validate cookies
        (response.get_fields('Set-Cookie')||[]).each do |cookie|
          log.debug("cookie received: #{ cookie }")
          # only the leading name=value part of the cookie is kept
          @cookies << cookie.split(";").first.strip
        end

        response.each_header {|k,v|
          log.debug("header: #{ k } : #{ v }")
        }

        page.response = response
        page.code = response.code

        response.read_body
        page.body = response.body

        log.info("status: #{ page.code }")

        page.watch_for_set = @watch_for_set

        case page.code
        when "200"
          return page
        when "301", "302", "303"
          location = response.header['Location']
          log.info("follow redirect to: #{ location }")
          return fetch_page(to_absolute_uri(location, page), :get, page)
        else
          # a bare raise here has no exception to re-raise and would only
          # report "unhandled exception"
          raise "unexpected response code: #{ page.code }"
        end
      }
    }
  end

  # Encodes +hash+ as a form-urlencoded string ("a=1&b=2").
  def build_query_string(hash)
    pairs = hash.map {|k, v|
      [WEBrick::HTTPUtils.escape_form(k),
       WEBrick::HTTPUtils.escape_form(v)].join("=")
    }
    pairs.join("&")
  end

  # Appends +page+ to the history. When max_history is set, only the last
  # max_history entries are kept.
  def add_to_history(page)
    @history.push(page)
    if @max_history && @history.size > @max_history
      # keep only the last @max_history entries
      @history = @history[@history.size - @max_history, @max_history]
    end
  end
end
end # module WWW