lib/ronin/web/cli/commands/spider.rb in ronin-web-1.0.2 vs lib/ronin/web/cli/commands/spider.rb in ronin-web-2.0.0.rc1
- old
+ new
@@ -1,10 +1,10 @@
# frozen_string_literal: true
#
# ronin-web - A collection of useful web helper methods and commands.
#
-# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
#
# ronin-web is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
@@ -17,11 +17,11 @@
# You should have received a copy of the GNU General Public License
# along with ronin-web. If not, see <https://www.gnu.org/licenses/>.
#
require 'ronin/web/cli/command'
-require 'ronin/web/spider'
+require 'ronin/web/cli/spider_options'
require 'ronin/web/spider/archive'
require 'ronin/web/spider/git_archive'
require 'ronin/support/network/http/user_agents'
require 'command_kit/colors'
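Most of the removals in this diff are a refactor, not deletions: the shared spidering options, their attr_reader backing state, and the initialize method were extracted out of this command into the new Ronin::Web::CLI::SpiderOptions mixin required above. A minimal sketch of how such a CommandKit options mixin is typically wired; the module body is a hypothetical reconstruction, not the actual ronin-web source:

    # Hypothetical reconstruction of an options mixin in CommandKit style.
    module SpiderOptions
      # When included into a command class, defines the shared options on
      # that class via CommandKit's `option` DSL.
      def self.included(command)
        command.option :host, value: {type: String, usage: 'HOST'},
                              desc:  'Spiders the specific HOST'

        command.option :domain, value: {type: String, usage: 'DOMAIN'},
                                desc:  'Spiders the whole domain'

        command.option :site, value: {type: String, usage: 'URL'},
                              desc:  'Spiders the website, starting at the URL'
      end
    end

The command below then only needs `include SpiderOptions` to pick the extracted flags back up.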
@@ -39,30 +39,37 @@
#
# ronin-web spider [options] {--host HOST | --domain DOMAIN | --site URL}
#
# ## Options
#
- # -v, --verbose Enables verbose output
+ # --host HOST Spiders the specific HOST
+ # --domain DOMAIN Spiders the whole domain
+ # --site URL Spiders the website, starting at the URL
# --open-timeout SECS Sets the connection open timeout
# --read-timeout SECS Sets the read timeout
# --ssl-timeout SECS Sets the SSL connection timeout
# --continue-timeout SECS Sets the continue timeout
# --keep-alive-timeout SECS Sets the connection keep alive timeout
- # -P, --proxy PROXY Sets the proxy to use.
+ # -P, --proxy PROXY Sets the proxy to use
# -H, --header NAME: VALUE Sets a default header
# --host-header NAME=VALUE Sets a custom Host header
+ # -U, --user-agent-string STRING The User-Agent string to use
# -u chrome-linux|chrome-macos|chrome-windows|chrome-iphone|chrome-ipad|chrome-android|firefox-linux|firefox-macos|firefox-windows|firefox-iphone|firefox-ipad|firefox-android|safari-macos|safari-iphone|safari-ipad|edge,
# --user-agent The User-Agent to use
- # -U, --user-agent-string STRING The User-Agent string to use
# -R, --referer URL Sets the Referer URL
# --delay SECS Sets the delay in seconds between each request
# -l, --limit COUNT Only spiders up to COUNT pages
# -d, --max-depth DEPTH Only spiders up to max depth
# --enqueue URL Adds the URL to the queue
# --visited URL Marks the URL as previously visited
# --strip-fragments Enables/disables stripping the fragment component of every URL
# --strip-query Enables/disables stripping the query component of every URL
+ # --visit-scheme SCHEME Visit URLs with the URI scheme
+ # --visit-schemes-like /REGEX/ Visit URLs with URI schemes that match the REGEX
+ # --ignore-scheme SCHEME Ignore the URLs with the URI scheme
+ # --ignore-schemes-like /REGEX/
+ # Ignore the URLs with URI schemes matching the REGEX
# --visit-host HOST Visit URLs with the matching host name
# --visit-hosts-like /REGEX/ Visit URLs with hostnames that match the REGEX
# --ignore-host HOST Ignore the host name
# --ignore-hosts-like /REGEX/ Ignore the host names matching the REGEX
# --visit-port PORT Visit URLs with the matching port number
@@ -76,324 +83,51 @@
# --visit-ext FILE_EXT Visit URLs with the matching file ext
# --visit-exts-like /REGEX/ Visit URLs with file exts that match the REGEX
# --ignore-ext FILE_EXT Ignore the URLs with the file ext
# --ignore-exts-like /REGEX/ Ignore URLs with file exts matching the REGEX
# -r, --robots Specifies whether to honor robots.txt
- # --host HOST Spiders the specific HOST
- # --domain DOMAIN Spiders the whole domain
- # --site URL Spiders the website, starting at the URL
- # --print-status Print the status codes for each URL
+ # -v, --verbose Enables verbose output
+ # --print-status Print the status codes for each URL
# --print-headers Print response headers for each URL
# --print-header NAME Prints a specific header
# --history FILE The history file
# --archive DIR Archive every visited page to the DIR
# --git-archive DIR Archive every visited page to the git repository
# -X, --xpath XPATH Evaluates the XPath on each HTML page
# -C, --css-path XPATH Evaluates the CSS-path on each HTML page
+ # --print-hosts Print all discovered hostnames
+ # --print-certs Print all encountered SSL/TLS certificates
+ # --save-certs Saves all encountered SSL/TLS certificates
+ # --print-js-strings Print all JavaScript strings
+ # --print-js-url-strings Print URL strings found in JavaScript
+ # --print-js-path-strings Print path strings found in JavaScript
+ # --print-js-absolute-path-strings
+ # Only print absolute path strings found in JavaScript
+ # --print-js-relative-path-strings
+ # Only print relative path strings found in JavaScript
+ # --print-html-comments Print HTML comments
+ # --print-js-comments Print JavaScript comments
+ # --print-comments Print all HTML and JavaScript comments
# -h, --help Print help information
#
# ## Examples
#
# ronin-web spider --host scanme.nmap.org
# ronin-web spider --domain nmap.org
# ronin-web spider --site https://scanme.nmap.org/
#
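The new JavaScript and comment extraction flags documented above compose with these examples, e.g. (invocations built only from flags shown in this diff):

    ronin-web spider --site https://scanme.nmap.org/ --print-js-url-strings
    ronin-web spider --domain nmap.org --print-comments --archive ./nmap-pages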
class Spider < Command
+ include SpiderOptions
include CommandKit::Colors
include CommandKit::Printing::Indent
include CommandKit::Options::Verbose
usage '[options] {--host HOST | --domain DOMAIN | --site URL}'
- option :open_timeout, value: {
- type: Integer,
- usage: 'SECS',
- default: Spidr.open_timeout
- },
- desc: 'Sets the connection open timeout'
+ option :print_status, desc: 'Print the status codes for each URL'
- option :read_timeout, value: {
- type: Integer,
- usage: 'SECS',
- default: Spidr.read_timeout
- },
- desc: 'Sets the read timeout'
-
- option :ssl_timeout, value: {
- type: Integer,
- usage: 'SECS',
- default: Spidr.ssl_timeout
- },
- desc: 'Sets the SSL connection timeout'
-
- option :continue_timeout, value: {
- type: Integer,
- usage: 'SECS',
- default: Spidr.continue_timeout
- },
- desc: 'Sets the continue timeout'
-
- option :keep_alive_timeout, value: {
- type: Integer,
- usage: 'SECS',
- default: Spidr.keep_alive_timeout
- },
- desc: 'Sets the connection keep alive timeout'
-
- option :proxy, short: '-P',
- value: {
- type: String,
- usage: 'PROXY'
- },
- desc: 'Sets the proxy to use'
-
- option :header, short: '-H',
- value: {
- type: /\A[^\s:]+:.*\z/,
- usage: 'NAME: VALUE'
- },
- desc: 'Sets a default header' do |header|
- name, value = header.split(/:\s*/,2)
-
- @default_headers[name] = value
- end
-
- option :host_header, value: {
- type: /\A[^\s=]+=[^\s=]+\z/,
- usage: 'NAME=VALUE'
- },
- desc: 'Sets a custom Host header' do |name_value|
- name, value = name_value.split('=',2)
-
- @host_headers[name] = value
- end
-
- option :user_agent, value: {
- type: String,
- usage: 'USER-AGENT'
- },
- desc: 'Sets the User-Agent string'
-
- option :user_agent_string, short: '-U',
- value: {
- type: String,
- usage: 'STRING'
- },
- desc: 'The User-Agent string to use' do |ua|
- @user_agent = ua
- end
-
- option :user_agent, short: '-u',
- value: {
- type: Support::Network::HTTP::UserAgents::ALIASES.transform_keys { |key|
- key.to_s.tr('_','-')
- }
- },
- desc: 'The User-Agent to use' do |name|
- @user_agent = name
- end
-
- option :referer, short: '-R',
- value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Sets the Referer URL'
-
- option :delay, short: '-d',
- value: {
- type: Numeric,
- usage: 'SECS'
- },
- desc: 'Sets the delay in seconds between each request'
-
- option :limit, short: '-l',
- value: {
- type: Integer,
- usage: 'COUNT'
- },
- desc: 'Only spiders up to COUNT pages'
-
- option :max_depth, short: '-d',
- value: {
- type: Integer,
- usage: 'DEPTH'
- },
- desc: 'Only spiders up to max depth'
-
- option :enqueue, value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Adds the URL to the queue' do |url|
- @queue << url
- end
-
- option :visited, value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Marks the URL as previously visited' do |url|
- @history << url
- end
-
- option :strip_fragments, desc: 'Enables/disables stripping the fragment component of every URL'
-
- option :strip_query, desc: 'Enables/disables stripping the query component of every URL'
-
- option :visit_host, value: {
- type: String,
- usage: 'HOST'
- },
- desc: 'Visit URLs with the matching host name' do |host|
- @visit_hosts << host
- end
-
- option :visit_hosts_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Visit URLs with hostnames that match the REGEX' do |regex|
- @visit_hosts << regex
- end
-
- option :ignore_host, value: {
- type: String,
- usage: 'HOST'
- },
- desc: 'Ignore the host name' do |host|
- @ignore_hosts << host
- end
-
- option :ignore_hosts_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Ignore the host names matching the REGEX' do |regex|
- @ignore_hosts << regex
- end
-
- option :visit_port, value: {
- type: Integer,
- usage: 'PORT'
- },
- desc: 'Visit URLs with the matching port number' do |port|
- @visit_ports << port
- end
-
- option :visit_ports_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Visit URLs with port numbers that match the REGEX' do |regex|
- @visit_ports << regex
- end
-
- option :ignore_port, value: {
- type: Integer,
- usage: 'PORT'
- },
- desc: 'Ignore the port number' do |port|
- @ignore_ports << port
- end
-
- option :ignore_ports_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Ignore the port numbers matching the REGEX' do |regex|
- @ignore_ports << regex
- end
-
- option :visit_link, value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Visit the URL' do |link|
- @visit_links << link
- end
-
- option :visit_links_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Visit URLs that match the REGEX' do |regex|
- @visit_links << regex
- end
-
- option :ignore_link, value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Ignore the URL' do |link|
- @ignore_links << link
- end
-
- option :ignore_links_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Ignore URLs matching the REGEX' do |regex|
- @ignore_links << regex
- end
-
- option :visit_ext, value: {
- type: String,
- usage: 'FILE_EXT'
- },
- desc: 'Visit URLs with the matching file ext' do |ext|
- @visit_exts << ext
- end
-
- option :visit_exts_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Visit URLs with file exts that match the REGEX' do |regex|
- @visit_exts << regex
- end
-
- option :ignore_ext, value: {
- type: String,
- usage: 'FILE_EXT'
- },
- desc: 'Ignore the URLs with the file ext' do |ext|
- @ignore_exts << ext
- end
-
- option :ignore_exts_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Ignore URLs with file exts matching the REGEX' do |regex|
- @ignore_exts << regex
- end
-
- option :robots, short: '-r',
- desc: 'Specifies whether to honor robots.txt'
-
- option :host, value: {
- type: String,
- usage: 'HOST'
- },
- desc: 'Spiders the specific HOST'
-
- option :domain, value: {
- type: String,
- usage: 'DOMAIN'
- },
- desc: 'Spiders the whole domain'
-
- option :site, value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Spiders the website, starting at the URL'
-
- option :print_status, desc: 'Print the status codes for each URL'
-
option :print_headers, desc: 'Print response headers for each URL'
option :print_header, value: {
type: String,
usage: 'NAME'
@@ -438,10 +172,18 @@
option :save_certs, desc: 'Saves all encountered SSL/TLS certificates'
option :print_js_strings, desc: 'Print all JavaScript strings'
+ option :print_js_url_strings, desc: 'Print URL strings found in JavaScript'
+
+ option :print_js_path_strings, desc: 'Print path strings found in JavaScript'
+
+ option :print_js_absolute_path_strings, desc: 'Only print absolute path strings found in JavaScript'
+
+ option :print_js_relative_path_strings, desc: 'Only print relative path strings found in JavaScript'
+
option :print_html_comments, desc: 'Print HTML comments'
option :print_js_comments, desc: 'Print JavaScript comments'
option :print_comments, desc: 'Print all HTML and JavaScript comments'
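Each of the four new print_js_* options above is a plain boolean flag; in run (shown in a later hunk) each one gates a matching agent.every_js_* callback. Roughly how the categories divide, assuming ronin-web-spider's string classification; the literals are illustrative only:

    # Given JavaScript source containing these string literals:
    #
    #   "https://api.example.com/v1"   reported by --print-js-url-strings
    #   "/api/v1/users"                reported by --print-js-absolute-path-strings
    #   "../assets/app.css"            reported by --print-js-relative-path-strings
    #
    # --print-js-path-strings reports both path categories;
    # --print-js-strings reports every string literal.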
@@ -454,103 +196,11 @@
"--site https://scanme.nmap.org/"
]
man_page 'ronin-web-spider.1'
- # The default HTTP headers to send with every request.
#
- # @return [Hash{String => String}]
- attr_reader :default_headers
-
- # The mapping of custom `Host` headers.
- #
- # @return [Hash{String => String}]
- attr_reader :host_headers
-
- # The pre-existing queue of URLs to start spidering with.
- #
- # @return [Array<String>]
- attr_reader :queue
-
- # The pre-existing list of previously visited URLs to start spidering with.
- #
- # @return [Array<String>]
- attr_reader :history
-
- # The schemes to visit.
- #
- # @return [Array<String>]
- attr_reader :visit_schemes
-
- # The hosts to visit.
- #
- # @return [Array<String, Regexp>]
- attr_reader :visit_hosts
-
- # The port numbers to visit.
- #
- # @return [Array<Integer, Regexp>]
- attr_reader :visit_ports
-
- # The links to visit.
- #
- # @return [Array<String, Regexp>]
- attr_reader :visit_links
-
- # The URL file extensions to visit.
- #
- # @return [Array<String, Regexp>]
- attr_reader :visit_exts
-
- # The hosts to ignore.
- #
- # @return [Array<String, Regexp>]
- attr_reader :ignore_hosts
-
- # The port numbers to ignore.
- #
- # @return [Array<Integer, Regexp>]
- attr_reader :ignore_ports
-
- # The links to ignore.
- #
- # @return [Array<String, Regexp>]
- attr_reader :ignore_links
-
- # The URL file extensions to ignore.
- #
- # @return [Array<String, Regexp>]
- attr_reader :ignore_exts
-
- #
- # Initializes the spider command.
- #
- # @param [Hash{Symbol => Object}] kwargs
- # Additional keyword arguments.
- #
- def initialize(**kwargs)
- super(**kwargs)
-
- @default_headers = {}
- @host_headers = {}
-
- @queue = []
- @history = []
-
- @visit_schemes = []
- @visit_hosts = []
- @visit_ports = []
- @visit_links = []
- @visit_exts = []
-
- @ignore_hosts = []
- @ignore_ports = []
- @ignore_links = []
- @ignore_exts = []
- end
-
- #
# Runs the `ronin-web spider` command.
#
def run
archive = if options[:archive]
Web::Spider::Archive.open(options[:archive])
@@ -644,10 +294,34 @@
agent.every_js_string do |string|
print_content string
end
end
+ if options[:print_js_url_strings]
+ agent.every_js_url_string do |url|
+ print_content url
+ end
+ end
+
+ if options[:print_js_path_strings]
+ agent.every_js_path_string do |path|
+ print_content path
+ end
+ end
+
+ if options[:print_js_absolute_path_strings]
+ agent.every_js_absolute_path_string do |path|
+ print_content path
+ end
+ end
+
+ if options[:print_js_relative_path_strings]
+ agent.every_js_relative_path_string do |path|
+ print_content path
+ end
+ end
+
if options[:print_html_comments]
agent.every_html_comment do |comment|
print_content comment
end
end
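The same callbacks are available when driving the spider from Ruby instead of the CLI; a minimal sketch using only agent methods that appear in this diff (the target URL is a placeholder):

    require 'ronin/web/spider'

    Ronin::Web::Spider.site('https://www.example.com/') do |agent|
      # Fires for every URL string literal found in JavaScript.
      agent.every_js_url_string { |url| puts "js url:  #{url}" }

      # Fires for every path string literal found in JavaScript.
      agent.every_js_path_string { |path| puts "js path: #{path}" }

      # Fires for every HTML comment on every visited page.
      agent.every_html_comment { |comment| puts "comment: #{comment}" }
    end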
@@ -661,89 +335,9 @@
if options[:print_comments]
agent.every_comment do |comment|
print_content comment
end
end
- end
-
- #
- # Creates a new web spider agent.
- #
- # @yield [agent]
- # The given block will be given the newly created and configured
- # web spider agent.
- #
- # @yieldparam [Ronin::Web::Spider::Agent] agent
- # The newly created web spider agent.
- #
- # @return [Ronin::Web::Spider::Agent]
- # The newly created web spider agent, after the agent has completed
- # its spidering.
- #
- def new_agent(&block)
- if options[:host]
- Web::Spider.host(options[:host],**agent_kwargs,&block)
- elsif options[:domain]
- Web::Spider.domain(options[:domain],**agent_kwargs,&block)
- elsif options[:site]
- Web::Spider.site(options[:site],**agent_kwargs,&block)
- else
- print_error "must specify --host, --domain, or --site"
- exit(-1)
- end
- end
-
- #
- # Builds keyword arguments for `Ronin::Web::Spider::Agent#initialize`.
- #
- # @return [Hash{Symbol => Object}]
- # The keyword arguments for `Ronin::Web::Spider::Agent#initialize`.
- #
- def agent_kwargs
- kwargs = {}
-
- kwargs[:proxy] = options[:proxy] if options[:proxy]
-
- unless @default_headers.empty?
- kwargs[:default_headers] = @default_headers
- end
-
- unless @host_headers.empty?
- kwargs[:host_headers] = @host_headers
- end
-
- kwargs[:user_agent] = @user_agent if @user_agent
- kwargs[:referer] = options[:referer] if options[:referer]
-
- kwargs[:delay] = options[:delay] if options[:delay]
- kwargs[:limit] = options[:limit] if options[:limit]
- kwargs[:max_depth] = options[:max_depth] if options[:max_depth]
-
- kwargs[:queue] = @queue unless @queue.empty?
- kwargs[:history] = @history unless @history.empty?
-
- if options.has_key?(:strip_fragments)
- kwargs[:strip_fragments] = options[:strip_fragments]
- end
-
- if options.has_key?(:strip_query)
- kwargs[:strip_query] = options[:strip_query]
- end
-
- kwargs[:schemes] = @visit_schemes unless @visit_schemes.empty?
- kwargs[:hosts] = @visit_hosts unless @visit_hosts.empty?
- kwargs[:ports] = @visit_ports unless @visit_ports.empty?
- kwargs[:links] = @visit_links unless @visit_links.empty?
- kwargs[:exts] = @visit_exts unless @visit_exts.empty?
-
- kwargs[:ignore_hosts] = @ignore_hosts unless @ignore_hosts.empty?
- kwargs[:ignore_ports] = @ignore_ports unless @ignore_ports.empty?
- kwargs[:ignore_links] = @ignore_links unless @ignore_links.empty?
- kwargs[:ignore_exts] = @ignore_exts unless @ignore_exts.empty?
-
- kwargs[:robots] = options[:robots] if options.has_key?(:robots)
-
- return kwargs
end
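The removed agent_kwargs above documents the full mapping from CLI flags to the spider agent's keyword arguments (now presumably built inside SpiderOptions). Driving the library directly with the same keywords, a sketch: the keyword names come from the removed code above, every_page is standard Spidr API, and the URL and pattern are placeholders:

    require 'ronin/web/spider'

    # Only visit https:// URLs whose host name ends in .example.com,
    # mirroring `--visit-scheme https --visit-hosts-like /REGEX/`.
    Ronin::Web::Spider.site(
      'https://www.example.com/',
      schemes: %w[https],
      hosts:   [/\.example\.com\z/]
    ) do |agent|
      agent.every_page do |page|
        puts page.url
      end
    end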
#
# Prints the status of a page.
#