# frozen_string_literal: true
#
# ronin-web - A collection of useful web helper methods and commands.
#
# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
#
# ronin-web is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ronin-web is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ronin-web. If not, see .
#
require 'ronin/web/spider'
require 'ronin/support/network/http/user_agents'
module Ronin
module Web
class CLI
#
# Adds options for spidering a website.
#
# @since 2.0.0
#
module SpiderOptions
#
# Adds options for configuring a web spider and spidering a website.
#
# @param [Class] command
# The command class including {SpiderOptions}.
#
def self.included(command)
command.usage '[options] {--host HOST | --domain DOMAIN | --site URL}'
command.option :host, value: {
type: String,
usage: 'HOST'
},
desc: 'Spiders the specific HOST'
command.option :domain, value: {
type: String,
usage: 'DOMAIN'
},
desc: 'Spiders the whole domain'
command.option :site, value: {
type: String,
usage: 'URL'
},
desc: 'Spiders the website, starting at the URL'
command.option :open_timeout, value: {
type: Integer,
usage: 'SECS',
default: Spidr.open_timeout
},
desc: 'Sets the connection open timeout' do |timeout|
self.open_timeout = timeout
end
command.option :read_timeout, value: {
type: Integer,
usage: 'SECS',
default: Spidr.read_timeout
},
desc: 'Sets the read timeout' do |timeout|
self.read_timeout = timeout
end
command.option :ssl_timeout, value: {
type: Integer,
usage: 'SECS',
default: Spidr.ssl_timeout
},
desc: 'Sets the SSL connection timeout' do |timeout|
self.ssl_timeout = timeout
end
command.option :continue_timeout, value: {
type: Integer,
usage: 'SECS',
default: Spidr.continue_timeout
},
desc: 'Sets the continue timeout' do |timeout|
self.continue_timeout = timeout
end
command.option :keep_alive_timeout, value: {
type: Integer,
usage: 'SECS',
default: Spidr.keep_alive_timeout
},
desc: 'Sets the connection keep alive timeout' do |timeout|
self.keep_alive_timeout = timeout
end
command.option :proxy, short: '-P',
value: {
type: String,
usage: 'PROXY'
},
desc: 'Sets the proxy to use' do |proxy|
self.proxy = proxy
end
command.option :header, short: '-H',
value: {
type: /\A[^\s:]+:.*\z/,
usage: 'NAME: VALUE'
},
desc: 'Sets a default header' do |header|
name, value = header.split(/:\s*/,2)
self.default_headers[name] = value
end
command.option :host_header, value: {
type: /\A[^\s=]+=[^\s=]+\z/,
usage: 'NAME=VALUE'
},
desc: 'Sets a default header' do |name_value|
name, value = name_value.split('=',2)
self.host_headers[name] = value
end
command.option :user_agent_string, short: '-U',
value: {
type: String,
usage: 'STRING'
},
desc: 'The User-Agent string to use' do |ua|
self.user_agent = ua
end
command.option :user_agent, short: '-u',
value: {
type: Support::Network::HTTP::UserAgents::ALIASES.transform_keys { |key|
key.to_s.tr('_','-')
}
},
desc: 'The User-Agent to use' do |name|
self.user_agent = name
end
command.option :referer, short: '-R',
value: {
type: String,
usage: 'URL'
},
desc: 'Sets the Referer URL' do |referer|
self.referer = referer
end
command.option :delay, short: '-d',
value: {
type: Numeric,
usage: 'SECS'
},
desc: 'Sets the delay in seconds between each request' do |delay|
self.delay = delay
end
command.option :limit, short: '-l',
value: {
type: Integer,
usage: 'COUNT'
},
desc: 'Only spiders up to COUNT pages' do |limit|
self.limit = limit
end
command.option :max_depth, short: '-d',
value: {
type: Integer,
usage: 'DEPTH'
},
desc: 'Only spiders up to max depth' do |depth|
self.max_depth = depth
end
command.option :enqueue, value: {
type: String,
usage: 'URL'
},
desc: 'Adds the URL to the queue' do |url|
self.queue << url
end
command.option :visited, value: {
type: String,
usage: 'URL'
},
desc: 'Marks the URL as previously visited' do |url|
self.history << url
end
command.option :strip_fragments, desc: 'Enables/disables stripping the fragment component of every URL' do
self.strip_fragments = true
end
command.option :strip_query, desc: 'Enables/disables stripping the query component of every URL' do
self.strip_query = true
end
command.option :visit_scheme, value: {
type: String,
usage: 'SCHEME'
},
desc: 'Visit URLs with the URI scheme' do |scheme|
self.visit_schemes << scheme
end
command.option :visit_schemes_like, value: {
type: Regexp,
usage: '/REGEX/'
},
desc: 'Visit URLs with URI schemes that match the REGEX' do |regex|
self.visit_schemes << regex
end
command.option :ignore_scheme, value: {
type: String,
usage: 'SCHEME'
},
desc: 'Ignore the URLs with the URI scheme' do |scheme|
self.ignore_schemes << scheme
end
command.option :ignore_schemes_like, value: {
type: Regexp,
usage: '/REGEX/'
},
desc: 'Ignore the URLs with URI schemes matching the REGEX' do |regex|
self.ignore_schemes << regex
end
command.option :visit_host, value: {
type: String,
usage: 'HOST'
},
desc: 'Visit URLs with the matching host name' do |host|
self.visit_hosts << host
end
command.option :visit_hosts_like, value: {
type: Regexp,
usage: '/REGEX/'
},
desc: 'Visit URLs with hostnames that match the REGEX' do |regex|
self.visit_hosts << regex
end
command.option :ignore_host, value: {
type: String,
usage: 'HOST'
},
desc: 'Ignore the host name' do |host|
self.ignore_hosts << host
end
command.option :ignore_hosts_like, value: {
type: Regexp,
usage: '/REGEX/'
},
desc: 'Ignore the host names matching the REGEX' do |regex|
self.ignore_hosts << regex
end
command.option :visit_port, value: {
type: Integer,
usage: 'PORT'
},
desc: 'Visit URLs with the matching port number' do |port|
self.visit_ports << port
end
command.option :visit_ports_like, value: {
type: Regexp,
usage: '/REGEX/'
},
desc: 'Visit URLs with port numbers that match the REGEX' do |regex|
self.visit_ports << regex
end
command.option :ignore_port, value: {
type: Integer,
usage: 'PORT'
},
desc: 'Ignore the port number' do |port|
self.ignore_ports << port
end
command.option :ignore_ports_like, value: {
type: Regexp,
usage: '/REGEX/'
},
desc: 'Ignore the port numbers matching the REGEXP' do |regex|
self.ignore_ports << regex
end
command.option :visit_link, value: {
type: String,
usage: 'URL'
},
desc: 'Visit the URL' do |link|
self.visit_links << link
end
command.option :visit_links_like, value: {
type: Regexp,
usage: '/REGEX/'
},
desc: 'Visit URLs that match the REGEX' do |regex|
self.visit_links << regex
end
command.option :ignore_link, value: {
type: String,
usage: 'URL'
},
desc: 'Ignore the URL' do |link|
self.ignore_links << link
end
command.option :ignore_links_like, value: {
type: Regexp,
usage: '/REGEX/'
},
desc: 'Ignore URLs matching the REGEX' do |regex|
self.ignore_links << regex
end
command.option :visit_ext, value: {
type: String,
usage: 'FILE_EXT'
},
desc: 'Visit URLs with the matching file ext' do |ext|
self.visit_exts << ext
end
command.option :visit_exts_like, value: {
type: Regexp,
usage: '/REGEX/'
},
desc: 'Visit URLs with file exts that match the REGEX' do |regex|
self.visit_exts << regex
end
command.option :ignore_ext, value: {
type: String,
usage: 'FILE_EXT'
},
desc: 'Ignore the URLs with the file ext' do |ext|
self.ignore_exts << ext
end
command.option :ignore_exts_like, value: {
type: Regexp,
usage: '/REGEX/'
},
desc: 'Ignore URLs with file exts matching the REGEX' do |regex|
self.ignore_exts << regex
end
command.option :robots, short: '-r',
desc: 'Specifies whether to honor robots.txt' do
self.robots = true
end
end
# Keyword arguments to initialize a new `Spidr::Agent`.
#
# @return [Hash{Symbol => Object}]
#
# @since 2.0.0
attr_reader :agent_kwargs
#
# Initializes the command.
#
# @param [Hash{Symbol => Object}] kwargs
# Additional keyword arguments.
#
def initialize(**kwargs)
super(**kwargs)
@agent_kwargs = {}
end
#
# Creates a new web spider agent.
#
# @yield [agent]
# The given block will be given the newly created and configured
# web spider agent.
#
# @yieldparam [Ronin::Web::Spider::Agent] agent
# The newly created web spider agent.
#
# @return [Ronin::Web::Spider::Agent]
# The newly created web spider agent, after the agent has completed
# it's spidering.
#
def new_agent(&block)
if options[:host]
Web::Spider.host(options[:host],**agent_kwargs,&block)
elsif options[:domain]
Web::Spider.domain(options[:domain],**agent_kwargs,&block)
elsif options[:site]
Web::Spider.site(options[:site],**agent_kwargs,&block)
else
print_error "must specify --host, --domain, or --site"
exit(-1)
end
end
#
# The open connection timeout.
#
# @return [Integer, nil]
#
# @since 2.0.0
#
def open_timeout
@agent_kwargs[:open_timeout]
end
#
# Sets the open connection timeout.
#
# @param [Integer] new_timeout
#
# @return [Integer]
#
# @since 2.0.0
#
def open_timeout=(new_timeout)
@agent_kwargs[:open_timeout] = new_timeout
end
#
# The read timeout.
#
# @return [Integer, nil]
#
# @since 2.0.0
#
def read_timeout
@agent_kwargs[:read_timeout]
end
#
# Sets the read timeout.
#
# @param [Integer] new_timeout
#
# @return [Integer]
#
# @since 2.0.0
#
def read_timeout=(new_timeout)
@agent_kwargs[:read_timeout] = new_timeout
end
#
# The SSL timeout.
#
# @return [Integer, nil]
#
# @since 2.0.0
#
def ssl_timeout
@agent_kwargs[:ssl_timeout]
end
#
# Sets the SSL timeout.
#
# @param [Integer] new_timeout
#
# @return [Integer]
#
# @since 2.0.0
#
def ssl_timeout=(new_timeout)
@agent_kwargs[:ssl_timeout] = new_timeout
end
#
# The continue timeout.
#
# @return [Integer, nil]
#
# @since 2.0.0
#
def continue_timeout
@agent_kwargs[:continue_timeout]
end
#
# Sets the continue timeout.
#
# @param [Integer] new_timeout
#
# @return [Integer]
#
# @since 2.0.0
#
def continue_timeout=(new_timeout)
@agent_kwargs[:continue_timeout] = new_timeout
end
#
# The `Keep-Alive` timeout.
#
# @return [Integer, nil]
#
# @since 2.0.0
#
def keep_alive_timeout
@agent_kwargs[:keep_alive_timeout]
end
#
# Sets the `Keep-Alive` timeout.
#
# @param [Integer] new_timeout
#
# @return [Integer]
#
# @since 2.0.0
#
def keep_alive_timeout=(new_timeout)
@agent_kwargs[:keep_alive_timeout] = new_timeout
end
#
# The proxy to use for spidering.
#
# @return [String, nil]
#
# @since 0.2.0
#
def proxy
@agent_kwargs[:proxy]
end
#
# Sets the proxy to use for spidering.
#
# @param [String] new_proxy
# The new proxy URI.
#
# @return [String]
#
# @since 2.0.0
#
def proxy=(new_proxy)
@agent_kwargs[:proxy] = new_proxy
end
#
# The default headers to send with every request.
#
# @return [Hash{String => String}]
#
# @since 2.0.0
#
def default_headers
@agent_kwargs[:default_headers] ||= {}
end
#
# The default `Host` headers to send with every request.
#
# @return [Hash{String => String}]
#
# @since 2.0.0
#
def host_headers
@agent_kwargs[:host_headers] ||= {}
end
#
# Sets the new `User-Agent` header to use for spidering.
#
# @return [String, nil]
#
# @since 2.0.0
#
def user_agent
@agent_kwargs[:user_agent]
end
#
# Sets the new `User-Agent` header to use for spidering.
#
# @param [String] new_user_agent
#
# @return [String]
#
# @since 2.0.0
#
def user_agent=(new_user_agent)
@agent_kwargs[:user_agent] = new_user_agent
end
#
# The `Referer` header to use for spidering.
#
# @return [String, nil]
#
# @since 2.0.0
#
def referer
@agent_kwargs[:referer]
end
#
# Sets the `Referer` header to use for spidering.
#
# @param [String] new_referer
#
# @return [String, nil]
#
# @since 2.0.0
#
def referer=(new_referer)
@agent_kwargs[:referer] = new_referer
end
#
# The amount of seconds to pause between each request.
#
# @return [Integer, Float, nil]
#
# @since 2.0.0
#
def delay
@agent_kwargs[:delay]
end
#
# Sets the amount of seconds to pause between each request.
#
# @param [Integer, Float] new_delay
#
# @return [Integer, Float]
#
# @since 2.0.0
#
def delay=(new_delay)
@agent_kwargs[:delay] = new_delay
end
#
# The limit to how many URLs to visit.
#
# @return [Integer, nil]
#
# @since 2.0.0
#
def limit
@agent_kwargs[:limit]
end
#
# Sets the limit of how many URLs to visit.
#
# @param [Integer] new_limit
#
# @return [Integer]
#
# @since 2.0.0
#
def limit=(new_limit)
@agent_kwargs[:limit] = new_limit
end
#
# The maximum depth to spider.
#
# @return [Integer, nil]
#
# @since 2.0.0
#
def max_depth
@agent_kwargs[:max_depth]
end
#
# Sets the maximum depth to spider.
#
# @param [Integer] new_max_depth
#
# @return [Integer]
#
# @since 2.0.0
#
def max_depth=(new_max_depth)
@agent_kwargs[:max_depth] = new_max_depth
end
#
# The pre-existing queue of URLs to start spidering.
#
# @return [Array]
#
# @since 2.0.0
#
def queue
@agent_kwargs[:queue] ||= []
end
#
# The pre-existing history of URLs that have already been spidered.
#
# @return [Array]
#
# @since 2.0.0
#
def history
@agent_kwargs[:history] ||= []
end
#
# Whether to strip the `#fragment` components of links.
#
# @return [Boolean]
#
# @since 2.0.0
#
def strip_fragments
@agent_kwargs[:strip_fragments]
end
#
# Sets whether to strip the `#fragment` components of links.
#
# @param [Boolean] new_value
#
# @return [Boolean]
#
# @since 2.0.0
#
def strip_fragments=(new_value)
@agent_kwargs[:strip_fragments] = new_value
end
#
# Whether to strip the `?query` components of links.
#
# @return [Boolean]
#
# @since 2.0.0
#
def strip_query
@agent_kwargs[:strip_query]
end
#
# Sets whether to strip the `?query` components of links.
#
# @param [Boolean] new_value
#
# @return [Boolean]
#
# @since 2.0.0
#
def strip_query=(new_value)
@agent_kwargs[:strip_query] = new_value
end
#
# The list of URI schemes to allow spidering.
#
# @return [Array]
#
# @since 2.0.0
#
def visit_schemes
@agent_kwargs[:schemes] ||= []
end
#
# The list of URI hosts to allow spidering.
#
# @return [Array]
#
# @since 2.0.0
#
def visit_hosts
@agent_kwargs[:hosts] ||= []
end
#
# The list of URI ports to allow spidering.
#
# @return [Array]
#
# @since 2.0.0
#
def visit_ports
@agent_kwargs[:ports] ||= []
end
#
# The list of URI links to allow spidering.
#
# @return [Array]
#
# @since 2.0.0
#
def visit_links
@agent_kwargs[:links] ||= []
end
#
# The list of URI file extensions to allow spidering.
#
# @return [Array]
#
# @since 2.0.0
#
def visit_exts
@agent_kwargs[:exts] ||= []
end
#
# The list of URI schemes to ignore while spidering.
#
# @return [Array]
#
# @since 2.0.0
#
def ignore_schemes
@agent_kwargs[:ignore_schemes] ||= []
end
#
# The list of URI hosts to ignore while spidering.
#
# @return [Array]
#
# @since 2.0.0
#
def ignore_hosts
@agent_kwargs[:ignore_hosts] ||= []
end
#
# The list of URI ports to ignore while spidering.
#
# @return [Array]
#
# @since 2.0.0
#
def ignore_ports
@agent_kwargs[:ignore_ports] ||= []
end
#
# The list of URI links to ignore while spidering.
#
# @return [Array]
#
# @since 2.0.0
#
def ignore_links
@agent_kwargs[:ignore_links] ||= []
end
#
# The list of URI file extensions to ignore while spidering.
#
# @return [Array]
#
# @since 2.0.0
#
def ignore_exts
@agent_kwargs[:ignore_exts] ||= []
end
#
# Whether to honor the `robots.txt` file while spidering.
#
# @return [Boolean]
#
# @since 2.0.0
#
def robots
@agent_kwargs[:robots]
end
#
# Sets whether to honor the `robots.txt` file while spidering.
#
# @param [Boolean] new_value
#
# @return [Boolean]
#
# @since 2.0.0
#
def robots=(new_value)
@agent_kwargs[:robots] = new_value
end
end
end
end
end