# #-- # Ronin Web - A Ruby library for Ronin that provides support for web # scraping and spidering functionality. # # Copyright (c) 2006-2009 Hal Brodigan (postmodern.mod3 at gmail.com) # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #++ # require 'ronin/network/http' require 'uri/http' require 'nokogiri' require 'mechanize' require 'open-uri' module Ronin module Web # # Returns a Nokogiri::HTML::Document object for the specified _body_ # of html. # def Web.html(body) Nokogiri::HTML(body) end # # Returns a Nokogiri::XML::Document object for the specified _body_ # of xml. # def Web.xml(body) Nokogiri::XML(body) end # # Returns the default Ronin Web proxy port. # def Web.default_proxy_port Network::HTTP.default_proxy_port end # # Sets the default Ronin Web proxy port to the specified _port_. # def Web.default_proxy_port=(port) Network::HTTP.default_proxy_port = port end # # Returns the +Hash+ of the Ronin Web proxy information. # def Web.proxy Network::HTTP.proxy end # # Resets the Web proxy settings. # def Web.disable_proxy Network::HTTP.disable_proxy end # # Creates a HTTP URI based from the given _proxy_info_ hash. The # _proxy_info_ hash defaults to Web.proxy, if not given. # def Web.proxy_url(proxy_info=Web.proxy) if Web.proxy[:host] userinfo = nil if (Web.proxy[:user] || Web.proxy[:password]) userinfo = "#{Web.proxy[:user]}:#{Web.proxy[:password]}" end return URI::HTTP.build( :host => Web.proxy[:host], :port => Web.proxy[:port], :userinfo => userinfo, :path => '/' ) end end # # Returns the supported Web User-Agent Aliases. # def Web.user_agent_aliases WWW::Mechanize::AGENT_ALIASES end # # Returns the Ronin Web User-Agent # def Web.user_agent Network::HTTP.user_agent end # # Sets the Ronin Web User-Agent to the specified _new_agent_. # def Web.user_agent=(new_agent) Network::HTTP.user_agent = new_agent end # # Sets the Ronin Web User-Agent to the specified user agent alias # _name_. # def Web.user_agent_alias=(name) Network::HTTP.user_agent = Web.user_agent_aliases[name.to_s] end # # Opens the _url_ with the given _options_. The contents of the _url_ # will be returned. # # _options_ may contain the following keys: # :user_agent_alias:: The User-Agent Alias to use. # :user_agent:: The User-Agent string to use. # :proxy:: A +Hash+ of the proxy information to use. # :user:: The HTTP Basic Authentication user name. # :password:: The HTTP Basic Authentication password. # :content_length_proc:: A callback which will be passed the # content-length of the HTTP response. # :progress_proc:: A callback which will be passed the size # of each fragment, once received from the # server. # # Web.open('http://www.hackety.org/') # # Web.open('http://tenderlovemaking.com/', # :user_agent_alias => 'Linux Mozilla') # # Web.open('http://www.wired.com/', :user_agent => 'the future') # def Web.open(url,options={}) user_agent_alias = options.delete(:user_agent_alias) proxy = (options.delete(:proxy) || Web.proxy) user = options.delete(:user) password = options.delete(:password) content_length_proc = options.delete(:content_length_proc) progress_proc = options.delete(:progress_proc) headers = Network::HTTP.headers(options) if user_agent_alias headers['User-Agent'] = Web.user_agent_aliases[user_agent_alias] end if proxy[:host] headers[:proxy] = Web.proxy_url(proxy) end if user headers[:http_basic_authentication] = [user, password] end if content_length_proc headers[:content_length_proc] = content_length_proc end if progress_proc headers[:progress_proc] = progress_proc end return Kernel.open(url,headers) end # # Creates a new Mechanize agent with the given _options_. # # _options_ may contain the following keys: # :user_agent_alias:: The User-Agent Alias to use. # :user_agent:: The User-Agent string to use. # :proxy:: A +Hash+ of the proxy information to use. # # Web.agent # Web.agent(:user_agent_alias => 'Linux Mozilla') # Web.agent(:user_agent => 'wooden pants') # def Web.agent(options={},&block) agent = WWW::Mechanize.new if options[:user_agent_alias] agent.user_agent_alias = options[:user_agent_alias] elsif options[:user_agent] agent.user_agent = options[:user_agent] elsif Web.user_agent agent.user_agent = Web.user_agent end proxy = (options[:proxy] || Web.proxy) if proxy[:host] agent.set_proxy(proxy[:host],proxy[:port],proxy[:user],proxy[:password]) end block.call(agent) if block return agent end # # Gets the specified _url_ with the given _options_. If a _block_ is # given, it will be passed the retrieved page. # # _options_ may contain the following keys: # :user_agent_alias:: The User-Agent Alias to use. # :user_agent:: The User-Agent string to use. # :proxy:: A +Hash+ of the proxy information to use. # # Web.get('http://www.0x000000.com') # => WWW::Mechanize::Page # # Web.get('http://www.rubyinside.com') do |page| # page.search('div.post/h2/a').each do |title| # puts title.inner_text # end # end # def Web.get(url,options={},&block) page = Web.agent(options).get(url) block.call(page) if block return page end # # Gets the specified _url_ with the given _options_, returning the body # of the requested page. If a _block_ is given, it will be passed the # body of the retrieved page. # # _options_ may contain the following keys: # :user_agent_alias:: The User-Agent Alias to use. # :user_agent:: The User-Agent string to use. # :proxy:: A +Hash+ of the proxy information to use. # # Web.get_body('http://www.rubyinside.com') # => String # # Web.get_body('http://www.rubyinside.com') do |body| # puts body # end # def Web.get_body(url,options={},&block) body = Web.get(url,options).body block.call(body) if block return body end # # Posts the specified _url_ with the given _options_. If a _block_ is # given, it will be passed the posted page. # # _options_ may contain the following keys: # :query:: The query parameters to post to the specified _url_. # :user_agent_alias:: The User-Agent Alias to use. # :user_agent:: The User-Agent string to use. # :proxy:: A +Hash+ of the proxy information to use. # # Web.post('http://www.rubyinside.com') # => WWW::Mechanize::Page # def Web.post(url,options={},&block) query = (options[:query] || {}) page = Web.agent(options).post(url,query) block.call(page) if block return page end # # Poststhe specified _url_ with the given _options_, returning the body # of the posted page. If a _block_ is given, it will be passed the # body of the posted page. # # _options_ may contain the following keys: # :user_agent_alias:: The User-Agent Alias to use. # :user_agent:: The User-Agent string to use. # :proxy:: A +Hash+ of the proxy information to use. # # Web.post_body('http://www.rubyinside.com') # => String # # Web.post_body('http://www.rubyinside.com') do |body| # puts body # end # def Web.post_body(url,options={},&block) body = Web.post(url,options).body block.call(body) if block return body end end end