lib/ronin/web/web.rb in ronin-web-0.2.1 vs lib/ronin/web/web.rb in ronin-web-0.3.0.pre1

- old
+ new

@@ -1,27 +1,29 @@ # # Ronin Web - A Ruby library for Ronin that provides support for web # scraping and spidering functionality. # -# Copyright (c) 2006-2009 Hal Brodigan (postmodern.mod3 at gmail.com) +# Copyright (c) 2006-2011 Hal Brodigan (postmodern.mod3 at gmail.com) # -# This program is free software; you can redistribute it and/or modify +# This file is part of Ronin Web. +# +# Ronin is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or +# the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # -# This program is distributed in the hope that it will be useful, +# Ronin is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# along with Ronin. If not, see <http://www.gnu.org/licenses/>. # -require 'ronin/network/http' +require 'ronin/network/http/proxy' +require 'ronin/network/http/http' require 'uri/http' require 'nokogiri' require 'mechanize' require 'open-uri' @@ -39,16 +41,21 @@ # object. # # @yieldparam [Nokogiri::HTML::Document] doc # The new HTML document object. # - # @return [Nokogiri::HTML::Document] The new HTML document object. + # @return [Nokogiri::HTML::Document] + # The new HTML document object. # - def Web.html(body,&block) + # @see http://rubydoc.info/gems/nokogiri/Nokogiri/HTML/Document + # + # @api public + # + def Web.html(body) doc = Nokogiri::HTML(body) - block.call(doc) if block + yield doc if block_given? return doc end # # Creates a new Nokogiri::HTML::Builder. @@ -68,10 +75,14 @@ # } # } # } # end # + # @see http://rubydoc.info/gems/nokogiri/Nokogiri/HTML/Builder + # + # @api public + # def Web.build_html(&block) Nokogiri::HTML::Builder.new(&block) end # @@ -85,16 +96,21 @@ # object. # # @yieldparam [Nokogiri::XML::Document] doc # The new XML document object. # - # @return [Nokogiri::XML::Document] The new XML document object. + # @return [Nokogiri::XML::Document] + # The new XML document object. # - def Web.xml(body,&block) + # @see http://rubydoc.info/gems/nokogiri/Nokogiri/XML/Document + # + # @api public + # + def Web.xml(body) doc = Nokogiri::XML(body) - block.call(doc) if block + yield doc if block_given? return doc end # # Creates a new Nokogiri::XML::Builder. @@ -111,85 +127,88 @@ # title { text('some example') } # body { text('this is one contrived example.') } # } # end # + # @see http://rubydoc.info/gems/nokogiri/Nokogiri/XML/Builder + # + # @api public + # def Web.build_xml(&block) Nokogiri::XML::Builder.new(&block) end # - # Proxy information for Ronin::Web to use. + # Proxy information for {Web} to use. # # @return [Network::HTTP::Proxy] # The Ronin Web proxy information. # - # @see Ronin::Network::HTTP.proxy + # @see http://rubydoc.info/gems/ronin-support/Ronin/Network/HTTP#proxy-class_method # + # @api public + # def Web.proxy - Network::HTTP.proxy + (@proxy ||= nil) || Network::HTTP.proxy end # - # Creates a HTTP URI based on a Hash of proxy information. + # Sets the proxy used by {Web}. # - # @param [Network::HTTP::Proxy, Hash, String] proxy_info - # The proxy information. + # @param [Network::HTTP::Proxy, URI::HTTP, Hash, String] new_proxy + # The new proxy information to use. # - # @return [URI::HTTP, nil] - # The HTTP URI that represents the proxy. If the proxy is diabled, - # +nil+ will be returned. + # @return [Network::HTTP::Proxy] + # The new proxy. # - # @example - # Web.proxy_url - # # => "http://www.example.com:8080" + # @since 0.3.0 # - # @example - # Web.proxy_url({:host => 'www.example.com', :port => 8081}) - # # => "http://www.example.com:8081" + # @api public # - # @example - # Web.proxy_url('www.example.com:9000') - # # => "http://www.example.com:9000" - # - def Web.proxy_url(proxy_info=Web.proxy) - proxy = if proxy_info.kind_of?(Hash) - Network::HTTP::Proxy.new(proxy_info) - else - Network::HTTP::Proxy.parse(proxy_info) - end - - return proxy.url + def Web.proxy=(new_proxy) + @proxy = Network::HTTP::Proxy.create(new_proxy) end # # @return [Array] # The supported Web User-Agent Aliases. # + # @see http://rubydoc.info/gems/mechanize/Mechanize#AGENT_ALIASES-constant + # + # @api public + # def Web.user_agent_aliases - WWW::Mechanize::AGENT_ALIASES + Mechanize::AGENT_ALIASES end # + # The User-Agent string used by {Web}. + # # @return [String, nil] # The Ronin Web User-Agent # + # @see http://rubydoc.info/gems/ronin-support/Ronin/Network/HTTP#user_agent-class_method + # + # @api public + # def Web.user_agent - Network::HTTP.user_agent + (@user_agent ||= nil) || Network::HTTP.user_agent end # - # Sets the Ronin Web User-Agent. + # Sets the User-Agent string used by {Web}. # # @param [String] new_agent # The User-Agent string to use. # # @return [String] # The new User-Agent string. # + # @api public + # def Web.user_agent=(new_agent) - Network::HTTP.user_agent = new_agent + @user_agent = new_agent end # # Sets the Ronin Web User-Agent. # @@ -197,12 +216,16 @@ # The User-Agent alias to use. # # @return [String] # The new User-Agent string. # + # @see user_agent_aliases + # + # @api public + # def Web.user_agent_alias=(name) - Network::HTTP.user_agent = Web.user_agent_aliases[name.to_s] + @user_agent = Web.user_agent_aliases[name.to_s] end # # Opens a URL as a temporary file. # @@ -244,13 +267,19 @@ # :user_agent_alias => 'Linux Mozilla') # # @example Open a given URL, using a custom User-Agent string. # Web.open('http://www.wired.com/', :user_agent => 'the future') # + # @see http://rubydoc.info/stdlib/open-uri/frames + # + # @api public + # def Web.open(url,options={}) user_agent_alias = options.delete(:user_agent_alias) - proxy = (options.delete(:proxy) || Web.proxy) + proxy = Network::HTTP::Proxy.create( + options.delete(:proxy) || Web.proxy + ) user = options.delete(:user) password = options.delete(:password) content_length_proc = options.delete(:content_length_proc) progress_proc = options.delete(:progress_proc) @@ -259,11 +288,11 @@ if user_agent_alias headers['User-Agent'] = Web.user_agent_aliases[user_agent_alias] end if proxy[:host] - headers[:proxy] = Web.proxy_url(proxy) + headers[:proxy] = proxy.url end if user headers[:http_basic_authentication] = [user, password] end @@ -297,14 +326,14 @@ # # @yield [agent] # If a block is given, it will be passed the newly created Mechanize # agent. # - # @yieldparam [WWW::Mechanize] agent + # @yieldparam [Mechanize] agent # The new Mechanize agent. # - # @return [WWW::Mechanize] + # @return [Mechanize] # The new Mechanize agent. # # @example Create a new agent. # Web.agent # @@ -312,38 +341,37 @@ # Web.agent(:user_agent_alias => 'Linux Mozilla') # # @example Create a new agent, with a custom User-Agent string. # Web.agent(:user_agent => 'wooden pants') # - # @see http://mechanize.rubyforge.org/mechanize/WWW/Mechanize.html + # @see http://rubydoc.info/gems/mechanize/Mechanize # - def Web.agent(options={},&block) - agent = WWW::Mechanize.new + # @api public + # + def Web.agent(options={}) + agent = Mechanize.new if options[:user_agent_alias] agent.user_agent_alias = options[:user_agent_alias] elsif options[:user_agent] agent.user_agent = options[:user_agent] elsif Web.user_agent agent.user_agent = Web.user_agent end - proxy = if options[:proxy].kind_of?(Hash) - options[:proxy] - elsif options[:proxy].kind_of?(String) - Network::HTTP::Proxy.parse(options[:proxy]) - elsif options[:proxy].nil? - Web.proxy - else - raise(RuntimeError,"the given :proxy option is neither a Proxy, Hash or String",caller) - end + proxy = Network::HTTP::Proxy.new(options[:proxy] || Web.proxy) if proxy[:host] - agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password]) + agent.set_proxy( + proxy[:host], + proxy[:port], + proxy[:user], + proxy[:password] + ) end - block.call(agent) if block + yield agent if block_given? return agent end # # Creates a Mechanize Page for the contents at a given URL. @@ -365,33 +393,35 @@ # # @yield [page] # If a block is given, it will be passed the page for the requested # URL. # - # @yieldparam [WWW::Mechanize::Page] page + # @yieldparam [Mechanize::Page] page # The requested page. # - # @return [WWW::Mechanize::Page] + # @return [Mechanize::Page] # The requested page. # # @example # Web.get('http://www.rubyinside.com') - # # => WWW::Mechanize::Page + # # => Mechanize::Page # # @example # Web.get('http://www.rubyinside.com') do |page| # page.search('div.post/h2/a').each do |title| # puts title.inner_text # end # end # - # @see http://mechanize.rubyforge.org/mechanize/WWW/Mechanize/Page.html + # @see http://rubydoc.info/gems/mechanize/Mechanize/Page # - def Web.get(url,options={},&block) + # @api public + # + def Web.get(url,options={}) page = Web.agent(options).get(url) - block.call(page) if block + yield page if block_given? return page end # # Requests the body of the Mechanize Page created from the response @@ -427,14 +457,18 @@ # @example # Web.get_body('http://www.rubyinside.com') do |body| # puts body # end # - def Web.get_body(url,options={},&block) + # @see get + # + # @api public + # + def Web.get_body(url,options={}) body = Web.get(url,options).body - block.call(body) if block + yield body if block_given? return body end # # Posts to a given URL and creates a Mechanize Page from the response. @@ -459,27 +493,31 @@ # # @yield [page] # If a block is given, it will be passed the page for the requested # URL. # - # @yieldparam [WWW::Mechanize::Page] page + # @yieldparam [Mechanize::Page] page # The requested page. # - # @return [WWW::Mechanize::Page] + # @return [Mechanize::Page] # The requested page. # # @example # Web.post('http://www.rubyinside.com') - # # => WWW::Mechanize::Page + # # => Mechanize::Page # - def Web.post(url,options={},&block) + # @see http://rubydoc.info/gems/mechanize/Mechanize/Page + # + # @api public + # + def Web.post(url,options={}) query = {} query.merge!(options[:query]) if options[:query] page = Web.agent(options).post(url,query) - block.call(page) if block + yield page if block_given? return page end # # Posts to a given URL and returns the body of the Mechanize Page @@ -504,14 +542,14 @@ # Proxy information. # # @yield [body] # If a block is given, it will be passed the body of the page. # - # @yieldparam [WWW::Mechanize::Page] page + # @yieldparam [Mechanize::Page] page # The body of the requested page. # - # @return [WWW::Mechanize::Page] + # @return [Mechanize::Page] # The body of the requested page. # # @example # Web.post_body('http://www.rubyinside.com') # # => String @@ -519,13 +557,17 @@ # @example # Web.post_body('http://www.rubyinside.com') do |body| # puts body # end # - def Web.post_body(url,options={},&block) + # @see post + # + # @api public + # + def Web.post_body(url,options={}) body = Web.post(url,options).body - block.call(body) if block + yield body if block_given? return body end end end