lib/ronin/web/web.rb in ronin-web-0.2.1 vs lib/ronin/web/web.rb in ronin-web-0.3.0.pre1
- old
+ new
@@ -1,27 +1,29 @@
#
# Ronin Web - A Ruby library for Ronin that provides support for web
# scraping and spidering functionality.
#
-# Copyright (c) 2006-2009 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2006-2011 Hal Brodigan (postmodern.mod3 at gmail.com)
#
-# This program is free software; you can redistribute it and/or modify
+# This file is part of Ronin Web.
+#
+# Ronin is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
+# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
-# This program is distributed in the hope that it will be useful,
+# Ronin is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+# along with Ronin. If not, see <http://www.gnu.org/licenses/>.
#
-require 'ronin/network/http'
+require 'ronin/network/http/proxy'
+require 'ronin/network/http/http'
require 'uri/http'
require 'nokogiri'
require 'mechanize'
require 'open-uri'
@@ -39,16 +41,21 @@
# object.
#
# @yieldparam [Nokogiri::HTML::Document] doc
# The new HTML document object.
#
- # @return [Nokogiri::HTML::Document] The new HTML document object.
+ # @return [Nokogiri::HTML::Document]
+ # The new HTML document object.
#
- def Web.html(body,&block)
+ # @see http://rubydoc.info/gems/nokogiri/Nokogiri/HTML/Document
+ #
+ # @api public
+ #
+ def Web.html(body)
# Parse the given body into a Nokogiri HTML document.
doc = Nokogiri::HTML(body)
# The block is optional; when given, it receives the parsed document.
- block.call(doc) if block
+ yield doc if block_given?
# The document is returned whether or not a block was given.
return doc
end
#
# Creates a new Nokogiri::HTML::Builder.
@@ -68,10 +75,14 @@
# }
# }
# }
# end
#
+ # @see http://rubydoc.info/gems/nokogiri/Nokogiri/HTML/Builder
+ #
+ # @api public
+ #
def Web.build_html(&block)
# The block is handed straight to Nokogiri's HTML builder; the new builder
# object is the return value.
Nokogiri::HTML::Builder.new(&block)
end
#
@@ -85,16 +96,21 @@
# object.
#
# @yieldparam [Nokogiri::XML::Document] doc
# The new XML document object.
#
- # @return [Nokogiri::XML::Document] The new XML document object.
+ # @return [Nokogiri::XML::Document]
+ # The new XML document object.
#
- def Web.xml(body,&block)
+ # @see http://rubydoc.info/gems/nokogiri/Nokogiri/XML/Document
+ #
+ # @api public
+ #
+ def Web.xml(body)
# Parse the given body into a Nokogiri XML document.
doc = Nokogiri::XML(body)
# The block is optional; when given, it receives the parsed document.
- block.call(doc) if block
+ yield doc if block_given?
# The document is returned whether or not a block was given.
return doc
end
#
# Creates a new Nokogiri::XML::Builder.
@@ -111,85 +127,88 @@
# title { text('some example') }
# body { text('this is one contrived example.') }
# }
# end
#
+ # @see http://rubydoc.info/gems/nokogiri/Nokogiri/XML/Builder
+ #
+ # @api public
+ #
def Web.build_xml(&block)
# The block is handed straight to Nokogiri's XML builder; the new builder
# object is the return value.
Nokogiri::XML::Builder.new(&block)
end
#
- # Proxy information for Ronin::Web to use.
+ # Proxy information for {Web} to use.
#
# @return [Network::HTTP::Proxy]
# The Ronin Web proxy information.
#
- # @see Ronin::Network::HTTP.proxy
+ # @see http://rubydoc.info/gems/ronin-support/Ronin/Network/HTTP#proxy-class_method
#
+ # @api public
+ #
def Web.proxy
# A proxy assigned through Web.proxy= takes precedence; otherwise fall back
# to Network::HTTP.proxy. NOTE(review): `@proxy ||= nil` presumably only
# initializes the ivar so that reading it never warns -- confirm.
- Network::HTTP.proxy
+ (@proxy ||= nil) || Network::HTTP.proxy
end
#
- # Creates a HTTP URI based on a Hash of proxy information.
+ # Sets the proxy used by {Web}.
#
- # @param [Network::HTTP::Proxy, Hash, String] proxy_info
- # The proxy information.
+ # @param [Network::HTTP::Proxy, URI::HTTP, Hash, String] new_proxy
+ # The new proxy information to use.
#
- # @return [URI::HTTP, nil]
- # The HTTP URI that represents the proxy. If the proxy is diabled,
- # +nil+ will be returned.
+ # @return [Network::HTTP::Proxy]
+ # The new proxy.
#
- # @example
- # Web.proxy_url
- # # => "http://www.example.com:8080"
+ # @since 0.3.0
#
- # @example
- # Web.proxy_url({:host => 'www.example.com', :port => 8081})
- # # => "http://www.example.com:8081"
+ # @api public
#
- # @example
- # Web.proxy_url('www.example.com:9000')
- # # => "http://www.example.com:9000"
- #
- def Web.proxy_url(proxy_info=Web.proxy)
- proxy = if proxy_info.kind_of?(Hash)
- Network::HTTP::Proxy.new(proxy_info)
- else
- Network::HTTP::Proxy.parse(proxy_info)
- end
-
- return proxy.url
+ def Web.proxy=(new_proxy)
# Store the proxy on Web itself, converted by Proxy.create (the @param docs
# list Proxy, URI::HTTP, Hash and String as the accepted inputs).
+ @proxy = Network::HTTP::Proxy.create(new_proxy)
end
#
# @return [Hash{String => String}]
# The supported Web User-Agent aliases, keyed by alias name.
#
+ # @see http://rubydoc.info/gems/mechanize/Mechanize#AGENT_ALIASES-constant
+ #
+ # @api public
+ #
def Web.user_agent_aliases
# Exposes Mechanize's own table of User-Agent aliases unchanged.
- WWW::Mechanize::AGENT_ALIASES
+ Mechanize::AGENT_ALIASES
end
#
+ # The User-Agent string used by {Web}.
+ #
# @return [String, nil]
# The Ronin Web User-Agent
#
+ # @see http://rubydoc.info/gems/ronin-support/Ronin/Network/HTTP#user_agent-class_method
+ #
+ # @api public
+ #
def Web.user_agent
# Prefer the User-Agent set on Web; when none is set, defer to
# Network::HTTP.user_agent.
- Network::HTTP.user_agent
+ (@user_agent ||= nil) || Network::HTTP.user_agent
end
#
- # Sets the Ronin Web User-Agent.
+ # Sets the User-Agent string used by {Web}.
#
# @param [String] new_agent
# The User-Agent string to use.
#
# @return [String]
# The new User-Agent string.
#
+ # @api public
+ #
def Web.user_agent=(new_agent)
# The value is kept on Web only; Network::HTTP.user_agent is no longer
# modified by this setter.
- Network::HTTP.user_agent = new_agent
+ @user_agent = new_agent
end
#
# Sets the Ronin Web User-Agent from a User-Agent alias name.
#
@@ -197,12 +216,16 @@
# The User-Agent alias to use.
#
# @return [String]
# The new User-Agent string.
#
+ # @see user_agent_aliases
+ #
+ # @api public
+ #
def Web.user_agent_alias=(name)
# The alias is looked up by its String form in Web.user_agent_aliases.
# NOTE(review): an unknown alias presumably yields nil, which would make
# Web.user_agent fall back to Network::HTTP.user_agent -- confirm intended.
- Network::HTTP.user_agent = Web.user_agent_aliases[name.to_s]
+ @user_agent = Web.user_agent_aliases[name.to_s]
end
#
# Opens a URL as a temporary file.
#
@@ -244,13 +267,19 @@
# :user_agent_alias => 'Linux Mozilla')
#
# @example Open a given URL, using a custom User-Agent string.
# Web.open('http://www.wired.com/', :user_agent => 'the future')
#
+ # @see http://rubydoc.info/stdlib/open-uri/frames
+ #
+ # @api public
+ #
def Web.open(url,options={})
user_agent_alias = options.delete(:user_agent_alias)
- proxy = (options.delete(:proxy) || Web.proxy)
+ proxy = Network::HTTP::Proxy.create(
+ options.delete(:proxy) || Web.proxy
+ )
user = options.delete(:user)
password = options.delete(:password)
content_length_proc = options.delete(:content_length_proc)
progress_proc = options.delete(:progress_proc)
@@ -259,11 +288,11 @@
if user_agent_alias
headers['User-Agent'] = Web.user_agent_aliases[user_agent_alias]
end
if proxy[:host]
- headers[:proxy] = Web.proxy_url(proxy)
+ headers[:proxy] = proxy.url
end
if user
headers[:http_basic_authentication] = [user, password]
end
@@ -297,14 +326,14 @@
#
# @yield [agent]
# If a block is given, it will be passed the newly created Mechanize
# agent.
#
- # @yieldparam [WWW::Mechanize] agent
+ # @yieldparam [Mechanize] agent
# The new Mechanize agent.
#
- # @return [WWW::Mechanize]
+ # @return [Mechanize]
# The new Mechanize agent.
#
# @example Create a new agent.
# Web.agent
#
@@ -312,38 +341,37 @@
# Web.agent(:user_agent_alias => 'Linux Mozilla')
#
# @example Create a new agent, with a custom User-Agent string.
# Web.agent(:user_agent => 'wooden pants')
#
- # @see http://mechanize.rubyforge.org/mechanize/WWW/Mechanize.html
+ # @see http://rubydoc.info/gems/mechanize/Mechanize
#
- def Web.agent(options={},&block)
- agent = WWW::Mechanize.new
+ # @api public
+ #
+ def Web.agent(options={})
+ agent = Mechanize.new
if options[:user_agent_alias]
agent.user_agent_alias = options[:user_agent_alias]
elsif options[:user_agent]
agent.user_agent = options[:user_agent]
elsif Web.user_agent
agent.user_agent = Web.user_agent
end
- proxy = if options[:proxy].kind_of?(Hash)
- options[:proxy]
- elsif options[:proxy].kind_of?(String)
- Network::HTTP::Proxy.parse(options[:proxy])
- elsif options[:proxy].nil?
- Web.proxy
- else
- raise(RuntimeError,"the given :proxy option is neither a Proxy, Hash or String",caller)
- end
+ proxy = Network::HTTP::Proxy.new(options[:proxy] || Web.proxy)
if proxy[:host]
- agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password])
+ agent.set_proxy(
+ proxy[:host],
+ proxy[:port],
+ proxy[:user],
+ proxy[:password]
+ )
end
- block.call(agent) if block
+ yield agent if block_given?
return agent
end
#
# Creates a Mechanize Page for the contents at a given URL.
@@ -365,33 +393,35 @@
#
# @yield [page]
# If a block is given, it will be passed the page for the requested
# URL.
#
- # @yieldparam [WWW::Mechanize::Page] page
+ # @yieldparam [Mechanize::Page] page
# The requested page.
#
- # @return [WWW::Mechanize::Page]
+ # @return [Mechanize::Page]
# The requested page.
#
# @example
# Web.get('http://www.rubyinside.com')
- # # => WWW::Mechanize::Page
+ # # => Mechanize::Page
#
# @example
# Web.get('http://www.rubyinside.com') do |page|
# page.search('div.post/h2/a').each do |title|
# puts title.inner_text
# end
# end
#
- # @see http://mechanize.rubyforge.org/mechanize/WWW/Mechanize/Page.html
+ # @see http://rubydoc.info/gems/mechanize/Mechanize/Page
#
- def Web.get(url,options={},&block)
+ # @api public
+ #
+ def Web.get(url,options={})
# A new agent is built from the options for this single request.
page = Web.agent(options).get(url)
# The block is optional; when given, it receives the fetched page.
- block.call(page) if block
+ yield page if block_given?
return page
end
#
# Requests the body of the Mechanize Page created from the response
@@ -427,14 +457,18 @@
# @example
# Web.get_body('http://www.rubyinside.com') do |body|
# puts body
# end
#
- def Web.get_body(url,options={},&block)
+ # @see get
+ #
+ # @api public
+ #
+ def Web.get_body(url,options={})
# Reuse Web.get and keep only the body of the resulting page.
body = Web.get(url,options).body
# The block is optional; when given, it receives the body.
- block.call(body) if block
+ yield body if block_given?
return body
end
#
# Posts to a given URL and creates a Mechanize Page from the response.
@@ -459,27 +493,31 @@
#
# @yield [page]
# If a block is given, it will be passed the page for the requested
# URL.
#
- # @yieldparam [WWW::Mechanize::Page] page
+ # @yieldparam [Mechanize::Page] page
# The requested page.
#
- # @return [WWW::Mechanize::Page]
+ # @return [Mechanize::Page]
# The requested page.
#
# @example
# Web.post('http://www.rubyinside.com')
- # # => WWW::Mechanize::Page
+ # # => Mechanize::Page
#
- def Web.post(url,options={},&block)
+ # @see http://rubydoc.info/gems/mechanize/Mechanize/Page
+ #
+ # @api public
+ #
+ def Web.post(url,options={})
# Build a fresh Hash of POST parameters from the optional :query option.
query = {}
query.merge!(options[:query]) if options[:query]
# The same options Hash is also passed to Web.agent to configure the agent.
page = Web.agent(options).post(url,query)
# The block is optional; when given, it receives the resulting page.
- block.call(page) if block
+ yield page if block_given?
return page
end
#
# Posts to a given URL and returns the body of the Mechanize Page
@@ -504,14 +542,14 @@
# Proxy information.
#
# @yield [body]
# If a block is given, it will be passed the body of the page.
#
- # @yieldparam [WWW::Mechanize::Page] page
+ # @yieldparam [Mechanize::Page] page
# The body of the requested page.
#
- # @return [WWW::Mechanize::Page]
+ # @return [Mechanize::Page]
# The body of the requested page.
#
# @example
# Web.post_body('http://www.rubyinside.com')
# # => String
@@ -519,13 +557,17 @@
# @example
# Web.post_body('http://www.rubyinside.com') do |body|
# puts body
# end
#
- def Web.post_body(url,options={},&block)
+ # @see post
+ #
+ # @api public
+ #
+ def Web.post_body(url,options={})
# Reuse Web.post and keep only the body of the resulting page.
body = Web.post(url,options).body
# The block is optional; when given, it receives the body.
- block.call(body) if block
+ yield body if block_given?
return body
end
end
end