#
# Ronin Web - A Ruby library for Ronin that provides support for web
# scraping and spidering functionality.
#
# Copyright (c) 2006-2011 Hal Brodigan (postmodern.mod3 at gmail.com)
#
# This file is part of Ronin Web.
#
# Ronin is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Ronin is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Ronin. If not, see .
#
require 'ronin/web/web'
require 'ronin/ui/output/helpers'
require 'spidr/agent'
module Ronin
module Web
#
# Extends [Spidr::Agent](http://rubydoc.info/gems/spidr/Agent) with
# [Ronin::UI::Output::Helpers](http://rubydoc.info/gems/ronin/Ronin/UI/Output/Helpers).
#
class Spider < Spidr::Agent
include UI::Output::Helpers
#
# Creates a new Spider object.
#
# @param [Hash] options
# Additional options.
#
# @option options [Hash] :proxy (Web.proxy)
# The proxy to use while spidering.
#
# @option options [String] :user_agent (Web.user_agent)
# The User-Agent string to send.
#
# @option options [String] :referer
# The referer URL to send.
#
# @option options [Integer] :delay (0)
# Duration in seconds to pause between spidering each link.
#
# @option options [Array] :schemes (['http', 'https'])
# The list of acceptable URI schemes to visit.
# The `https` scheme will be ignored if `net/https` cannot be
# loaded.
#
# @option options [String] :host
# The host-name to visit.
#
# @option options [Array] :hosts
# The patterns which match the host-names to visit.
#
# @option options [Array] :ignore_hosts
# The patterns which match the host-names to not visit.
#
# @option options [Array] :ports
# The patterns which match the ports to visit.
#
# @option options [Array] :ignore_ports
# The patterns which match the ports to not visit.
#
# @option options [Array] :links
# The patterns which match the links to visit.
#
# @option options [Array] :ignore_links
# The patterns which match the links to not visit.
#
# @option options [Array] :exts
# The patterns which match the URI path extensions to visit.
#
# @option options [Array] :ignore_exts
# The patterns which match the URI path extensions to not visit.
#
# @yield [spider]
# If a block is given, it will be passed the newly created spider.
#
# @yieldparam [Spider] spider
# The newly created spider.
#
# @see http://spidr.rubyforge.org/docs/classes/Spidr/Agent.html
#
# @api public
#
def initialize(options={})
options = {
:proxy => Web.proxy,
:user_agent => Web.user_agent
}.merge(options)
super(options)
every_url do |url|
print_info("Spidering #{url}")
end
yield self if block_given?
end
end
end
end