lib/ronin/web/cli/commands/spider.rb in ronin-web-1.0.2 vs lib/ronin/web/cli/commands/spider.rb in ronin-web-2.0.0.rc1

- old (lines removed, ronin-web 1.0.2)
+ new (lines added, ronin-web 2.0.0.rc1)

@@ -1,10 +1,10 @@
 # frozen_string_literal: true
 #
 # ronin-web - A collection of useful web helper methods and commands.
 #
-# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
 #
 # ronin-web is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
@@ -17,11 +17,11 @@
 # You should have received a copy of the GNU General Public License
 # along with ronin-web. If not, see <https://www.gnu.org/licenses/>.
 #

 require 'ronin/web/cli/command'
-require 'ronin/web/spider'
+require 'ronin/web/cli/spider_options'
 require 'ronin/web/spider/archive'
 require 'ronin/web/spider/git_archive'

 require 'ronin/support/network/http/user_agents'
 require 'command_kit/colors'
@@ -39,30 +39,37 @@
         #
         #     ronin-web spider [options] {--host HOST | --domain DOMAIN | --site URL}
         #
         # ## Options
         #
-        #     -v, --verbose                    Enables verbose output
+        #         --host HOST                  Spiders the specific HOST
+        #         --domain DOMAIN              Spiders the whole domain
+        #         --site URL                   Spiders the website, starting at the URL
         #         --open-timeout SECS          Sets the connection open timeout
         #         --read-timeout SECS          Sets the read timeout
         #         --ssl-timeout SECS           Sets the SSL connection timeout
         #         --continue-timeout SECS      Sets the continue timeout
         #         --keep-alive-timeout SECS    Sets the connection keep alive timeout
-        #     -P, --proxy PROXY                Sets the proxy to use.
+        #     -P, --proxy PROXY                Sets the proxy to use
         #     -H, --header NAME: VALUE         Sets a default header
         #         --host-header NAME=VALUE     Sets a default header
+        #     -U, --user-agent-string STRING   The User-Agent string to use
         #     -u chrome-linux|chrome-macos|chrome-windows|chrome-iphone|chrome-ipad|chrome-android|firefox-linux|firefox-macos|firefox-windows|firefox-iphone|firefox-ipad|firefox-android|safari-macos|safari-iphone|safari-ipad|edge,
         #         --user-agent                 The User-Agent to use
-        #     -U, --user-agent-string STRING   The User-Agent string to use
         #     -R, --referer URL                Sets the Referer URL
         #         --delay SECS                 Sets the delay in seconds between each request
         #     -l, --limit COUNT                Only spiders up to COUNT pages
         #     -d, --max-depth DEPTH            Only spiders up to max depth
         #         --enqueue URL                Adds the URL to the queue
         #         --visited URL                Marks the URL as previously visited
         #         --strip-fragments            Enables/disables stripping the fragment component of every URL
         #         --strip-query                Enables/disables stripping the query component of every URL
+        #         --visit-scheme SCHEME        Visit URLs with the URI scheme
+        #         --visit-schemes-like /REGEX/ Visit URLs with URI schemes that match the REGEX
+        #         --ignore-scheme SCHEME       Ignore the URLs with the URI scheme
+        #         --ignore-schemes-like /REGEX/
+        #                                      Ignore the URLs with URI schemes matching the REGEX
         #         --visit-host HOST            Visit URLs with the matching host name
         #         --visit-hosts-like /REGEX/   Visit URLs with hostnames that match the REGEX
         #         --ignore-host HOST           Ignore the host name
         #         --ignore-hosts-like /REGEX/  Ignore the host names matching the REGEX
         #         --visit-port PORT            Visit URLs with the matching port number
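The reordered option summary above (continued in the next hunk) is the user-visible side of this release's main refactor: the generic spidering flags now come from the shared Ronin::Web::CLI::SpiderOptions mixin (note the swapped require), and URL filtering gained --visit-scheme/--ignore-scheme variants. A hedged sketch of the library-level keyword arguments these filtering flags map onto, based on the agent_kwargs method removed further down in this diff (example.com is a placeholder):

    require 'ronin/web/spider'

    # Roughly what `ronin-web spider --site https://example.com/` with
    # `--visit-scheme https`, `--ignore-host cdn.example.com`, and
    # `--ignore-exts-like /\.(png|gif)/` would configure:
    Ronin::Web::Spider.site(
      'https://example.com/',
      schemes:      %w[https],            # --visit-scheme SCHEME
      ignore_hosts: %w[cdn.example.com],  # --ignore-host HOST
      ignore_exts:  [/\.(png|gif)\z/]     # --ignore-exts-like /REGEX/
    ) do |agent|
      agent.every_url { |url| puts url }  # print each visited URL
    end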
@@ -76,324 +83,51 @@
         #         --visit-ext FILE_EXT         Visit URLs with the matching file ext
         #         --visit-exts-like /REGEX/    Visit URLs with file exts that match the REGEX
         #         --ignore-ext FILE_EXT        Ignore the URLs with the file ext
         #         --ignore-exts-like /REGEX/   Ignore URLs with file exts matching the REGEX
         #     -r, --robots                     Specifies whether to honor robots.txt
-        #         --host HOST                  Spiders the specific HOST
-        #         --domain DOMAIN              Spiders the whole domain
-        #         --site URL                   Spiders the website, starting at the URL
-        #         --print-status               Print the status codes for each URL
+        #     -v, --verbose                    Enables verbose output
+        #         --print-stauts               Print the status codes for each URL
         #         --print-headers              Print response headers for each URL
         #         --print-header NAME          Prints a specific header
         #         --history FILE               The history file
         #         --archive DIR                Archive every visited page to the DIR
         #         --git-archive DIR            Archive every visited page to the git repository
         #     -X, --xpath XPATH                Evaluates the XPath on each HTML page
         #     -C, --css-path XPATH             Evaluates the CSS-path on each HTML page
+        #         --print-hosts                Print all discovered hostnames
+        #         --print-certs                Print all encountered SSL/TLS certificates
+        #         --save-certs                 Saves all encountered SSL/TLS certificates
+        #         --print-js-strings           Print all JavaScript strings
+        #         --print-js-url-strings       Print URL strings found in JavaScript
+        #         --print-js-path-strings      Print path strings found in JavaScript
+        #         --print-js-absolute-path-strings
+        #                                      Only print absolute path strings found in JavaScript
+        #         --print-js-relative-path-strings
+        #                                      Only print relative path strings found in JavaScript
+        #         --print-html-comments        Print HTML comments
+        #         --print-js-comments          Print JavaScript comments
+        #         --print-comments             Print all HTML and JavaScript comments
         #     -h, --help                       Print help information
         #
         # ## Examples
         #
         #     ronin-web spider --host scanme.nmap.org
         #     ronin-web spider --domain nmap.org
         #     ronin-web spider --site https://scanme.nmap.org/
         #
         class Spider < Command

+          include SpiderOptions
           include CommandKit::Colors
           include CommandKit::Printing::Indent
           include CommandKit::Options::Verbose

           usage '[options] {--host HOST | --domain DOMAIN | --site URL}'

-          option :open_timeout, value: {
-                                  type: Integer,
-                                  usage: 'SECS',
-                                  default: Spidr.open_timeout
-                                },
-                                desc: 'Sets the connection open timeout'
+          option :print_stauts, desc: 'Print the status codes for each URL'

-          option :read_timeout, value: {
-                                  type: Integer,
-                                  usage: 'SECS',
-                                  default: Spidr.read_timeout
-                                },
-                                desc: 'Sets the read timeout'
-
-          option :ssl_timeout, value: {
-                                 type: Integer,
-                                 usage: 'SECS',
-                                 default: Spidr.ssl_timeout
-                               },
-                               desc: 'Sets the SSL connection timeout'
-
-          option :continue_timeout, value: {
-                                      type: Integer,
-                                      usage: 'SECS',
-                                      default: Spidr.continue_timeout
-                                    },
-                                    desc: 'Sets the continue timeout'
-
-          option :keep_alive_timeout, value: {
-                                        type: Integer,
-                                        usage: 'SECS',
-                                        default: Spidr.keep_alive_timeout
-                                      },
-                                      desc: 'Sets the connection keep alive timeout'
-
-          option :proxy, short: '-P',
-                         value: {
-                           type: String,
-                           usage: 'PROXY'
-                         },
-                         desc: 'Sets the proxy to use'
-
-          option :header, short: '-H',
-                          value: {
-                            type: /\A[^\s:]+:.*\z/,
-                            usage: 'NAME: VALUE'
-                          },
-                          desc: 'Sets a default header' do |header|
-                            name, value = header.split(/:\s*/,2)
-
-                            @default_headers[name] = value
-                          end
-
-          option :host_header, value: {
-                                 type: /\A[^\s=]+=[^\s=]+\z/,
-                                 usage: 'NAME=VALUE'
-                               },
-                               desc: 'Sets a default header' do |name_value|
-                                 name, value = name_value.split('=',2)
-
-                                 @host_headers[name] = value
-                               end
-
-          option :user_agent, value: {
-                                type: String,
-                                usage: 'USER-AGENT'
-                              },
-                              desc: 'Sets the User-Agent string'
-
-          option :user_agent_string, short: '-U',
-                                     value: {
-                                       type: String,
-                                       usage: 'STRING'
-                                     },
-                                     desc: 'The User-Agent string to use' do |ua|
-                                       @user_agent = ua
-                                     end
-
-          option :user_agent, short: '-u',
-                              value: {
-                                type: Support::Network::HTTP::UserAgents::ALIASES.transform_keys { |key|
-                                  key.to_s.tr('_','-')
-                                }
-                              },
-                              desc: 'The User-Agent to use' do |name|
-                                @user_agent = name
-                              end
-
-          option :referer, short: '-R',
-                           value: {
-                             type: String,
-                             usage: 'URL'
-                           },
-                           desc: 'Sets the Referer URL'
-
-          option :delay, short: '-d',
-                         value: {
-                           type: Numeric,
-                           usage: 'SECS'
-                         },
-                         desc: 'Sets the delay in seconds between each request'
-
-          option :limit, short: '-l',
-                         value: {
-                           type: Integer,
-                           usage: 'COUNT'
-                         },
-                         desc: 'Only spiders up to COUNT pages'
-
-          option :max_depth, short: '-d',
-                             value: {
-                               type: Integer,
-                               usage: 'DEPTH'
-                             },
-                             desc: 'Only spiders up to max depth'
-
-          option :enqueue, value: {
-                             type: String,
-                             usage: 'URL'
-                           },
-                           desc: 'Adds the URL to the queue' do |url|
-                             @queue << url
-                           end
-
-          option :visited, value: {
-                             type: String,
-                             usage: 'URL'
-                           },
-                           desc: 'Marks the URL as previously visited' do |url|
-                             @history << url
-                           end
-
-          option :strip_fragments, desc: 'Enables/disables stripping the fragment component of every URL'
-
-          option :strip_query, desc: 'Enables/disables stripping the query component of every URL'
-
-          option :visit_host, value: {
-                                type: String,
-                                usage: 'HOST'
-                              },
-                              desc: 'Visit URLs with the matching host name' do |host|
-                                @visit_hosts << host
-                              end
-
-          option :visit_hosts_like, value: {
-                                      type: Regexp,
-                                      usage: '/REGEX/'
-                                    },
-                                    desc: 'Visit URLs with hostnames that match the REGEX' do |regex|
-                                      @visit_hosts << regex
-                                    end
-
-          option :ignore_host, value: {
-                                 type: String,
-                                 usage: 'HOST'
-                               },
-                               desc: 'Ignore the host name' do |host|
-                                 @ignore_hosts << host
-                               end
-
-          option :ignore_hosts_like, value: {
-                                       type: Regexp,
-                                       usage: '/REGEX/'
-                                     },
-                                     desc: 'Ignore the host names matching the REGEX' do |regex|
-                                       @ignore_hosts << regex
-                                     end
-
-          option :visit_port, value: {
-                                type: Integer,
-                                usage: 'PORT'
-                              },
-                              desc: 'Visit URLs with the matching port number' do |port|
-                                @visit_ports << port
-                              end
-
-          option :visit_ports_like, value: {
-                                      type: Regexp,
-                                      usage: '/REGEX/'
-                                    },
-                                    desc: 'Visit URLs with port numbers that match the REGEX' do |regex|
-                                      @visit_ports << regex
-                                    end
-
-          option :ignore_port, value: {
-                                 type: Integer,
-                                 usage: 'PORT'
-                               },
-                               desc: 'Ignore the port number' do |port|
-                                 @ignore_ports << port
-                               end
-
-          option :ignore_ports_like, value: {
-                                       type: Regexp,
-                                       usage: '/REGEX/'
-                                     },
-                                     desc: 'Ignore the port numbers matching the REGEXP' do |regex|
-                                       @ignore_ports << regex
-                                     end
-
-          option :visit_link, value: {
-                                type: String,
-                                usage: 'URL'
-                              },
-                              desc: 'Visit the URL' do |link|
-                                @visit_links << link
-                              end
-
-          option :visit_links_like, value: {
-                                      type: Regexp,
-                                      usage: '/REGEX/'
-                                    },
-                                    desc: 'Visit URLs that match the REGEX' do |regex|
-                                      @visit_links << regex
-                                    end
-
-          option :ignore_link, value: {
-                                 type: String,
-                                 usage: 'URL'
-                               },
-                               desc: 'Ignore the URL' do |link|
-                                 @ignore_links << link
-                               end
-
-          option :ignore_links_like, value: {
-                                       type: Regexp,
-                                       usage: '/REGEX/'
-                                     },
-                                     desc: 'Ignore URLs matching the REGEX' do |regex|
-                                       @ignore_links << regex
-                                     end
-
-          option :visit_ext, value: {
-                               type: String,
-                               usage: 'FILE_EXT'
-                             },
-                             desc: 'Visit URLs with the matching file ext' do |ext|
-                               @visit_exts << ext
-                             end
-
-          option :visit_exts_like, value: {
-                                     type: Regexp,
-                                     usage: '/REGEX/'
-                                   },
-                                   desc: 'Visit URLs with file exts that match the REGEX' do |regex|
-                                     @visit_exts << regex
-                                   end
-
-          option :ignore_ext, value: {
-                                type: String,
-                                usage: 'FILE_EXT'
-                              },
-                              desc: 'Ignore the URLs with the file ext' do |ext|
-                                @ignore_exts << ext
-                              end
-
-          option :ignore_exts_like, value: {
-                                      type: Regexp,
-                                      usage: '/REGEX/'
-                                    },
-                                    desc: 'Ignore URLs with file exts matching the REGEX' do |regex|
-                                      @ignore_exts << regex
-                                    end
-
-          option :robots, short: '-r',
-                          desc: 'Specifies whether to honor robots.txt'
-
-          option :host, value: {
-                          type: String,
-                          usage: 'HOST'
-                        },
-                        desc: 'Spiders the specific HOST'
-
-          option :domain, value: {
-                            type: String,
-                            usage: 'DOMAIN'
-                          },
-                          desc: 'Spiders the whole domain'
-
-          option :site, value: {
-                          type: String,
-                          usage: 'URL'
-                        },
-                        desc: 'Spiders the website, starting at the URL'
-
-          option :print_status, desc: 'Print the status codes for each URL'
-
           option :print_headers, desc: 'Print response headers for each URL'

           option :print_header, value: {
                                   type: String,
                                   usage: 'NAME'
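None of the option definitions deleted above were dropped from the CLI; they were relocated into the shared Ronin::Web::CLI::SpiderOptions module (required at the top of the file) so other spidering commands can reuse them. The module body is not part of this diff; a minimal, hypothetical sketch of the CommandKit include pattern it presumably follows, mirroring the definitions deleted above:

    module Ronin
      module Web
        class CLI
          # Hypothetical sketch -- the real module lives in
          # lib/ronin/web/cli/spider_options.rb, which this diff does not show.
          module SpiderOptions
            def self.included(command)
              # Register the relocated options on the including command class.
              command.option :host, value: {type: String, usage: 'HOST'},
                                    desc: 'Spiders the specific HOST'
              command.option :domain, value: {type: String, usage: 'DOMAIN'},
                                      desc: 'Spiders the whole domain'
              # ... --site, the timeouts, and the --visit-*/--ignore-* options ...
            end
          end
        end
      end
    end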
@@ -438,10 +172,18 @@
           option :save_certs, desc: 'Saves all encountered SSL/TLS certificates'

           option :print_js_strings, desc: 'Print all JavaScript strings'

+          option :print_js_url_strings, desc: 'Print URL strings found in JavaScript'
+
+          option :print_js_path_strings, desc: 'Print path strings found in JavaScript'
+
+          option :print_js_absolute_path_strings, desc: 'Only print absolute path strings found in JavaScript'
+
+          option :print_js_relative_path_strings, desc: 'Only print relative path strings found in JavaScript'
+
           option :print_html_comments, desc: 'Print HTML comments'

           option :print_js_comments, desc: 'Print JavaScript comments'

           option :print_comments, desc: 'Print all HTML and JavaScript comments'
@@ -454,103 +196,11 @@
             "--site https://scanme.nmap.org/"
           ]

           man_page 'ronin-web-spider.1'

-          # The default HTTP headers to send with every request.
-          #
-          # @return [Hash{String => String}]
-          attr_reader :default_headers
-
-          # The mapping of custom `Host` headers.
-          #
-          # @return [Hash{String => String}]
-          attr_reader :host_headers
-
-          # The pre-existing queue of URLs to start spidering with.
-          #
-          # @return [Array<String>]
-          attr_reader :queue
-
-          # The pre-existing of previously visited URLs to start spidering with.
-          #
-          # @return [Array<String>]
-          attr_reader :history
-
-          # The schemes to visit.
-          #
-          # @return [Array<String>]
-          attr_reader :visit_schemes
-
-          # The hosts to visit.
-          #
-          # @return [Array<String, Regexp>]
-          attr_reader :visit_hosts
-
-          # The port numbers to visit.
-          #
-          # @return [Array<Integer, Regexp>]
-          attr_reader :visit_ports
-
-          # The links to visit.
-          #
-          # @return [Array<String, Regexp>]
-          attr_reader :visit_links
-
-          # The URL file extensions to visit.
-          #
-          # @return [Array<String, Regexp>]
-          attr_reader :visit_exts
-
-          # The hosts to ignore.
-          #
-          # @return [Array<String, Regexp>]
-          attr_reader :ignore_hosts
-
-          # The port numbers to ignore.
-          #
-          # @return [Array<Integer, Regexp>]
-          attr_reader :ignore_ports
-
-          # The links to ignore.
-          #
-          # @return [Array<String, Regexp>]
-          attr_reader :ignore_links
-
-          # The URL file extensions to ignore.
-          #
-          # @return [Array<String, Regexp>]
-          attr_reader :ignore_exts
-
-          #
-          # Initializes the spider command.
-          #
-          # @param [Hash{Symbol => Object}] kwargs
-          #   Additional keyword arguments.
-          #
-          def initialize(**kwargs)
-            super(**kwargs)
-
-            @default_headers = {}
-            @host_headers    = {}
-
-            @queue   = []
-            @history = []
-
-            @visit_schemes = []
-            @visit_hosts   = []
-            @visit_ports   = []
-            @visit_links   = []
-            @visit_exts    = []
-
-            @ignore_hosts = []
-            @ignore_ports = []
-            @ignore_links = []
-            @ignore_exts  = []
-          end
-
-          #
           # Runs the `ronin-web spider` command.
           #
           def run
            archive = if options[:archive]
                        Web::Spider::Archive.open(options[:archive])
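The four new --print-js-*-strings options pair one-to-one with the every_js_*_string callbacks that the run method wires up in the hunk below. Those callbacks can also be used directly against the underlying spider agent; a short sketch (example.com is a placeholder):

    require 'ronin/web/spider'

    Ronin::Web::Spider.host('example.com') do |agent|
      # equivalent of --print-js-url-strings
      agent.every_js_url_string { |url| puts "url: #{url}" }

      # equivalent of --print-js-relative-path-strings
      agent.every_js_relative_path_string { |path| puts "path: #{path}" }
    end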
@@ -644,10 +294,34 @@
             agent.every_js_string do |string|
               print_content string
             end
           end

+          if options[:print_js_url_strings]
+            agent.every_js_url_string do |url|
+              print_content url
+            end
+          end
+
+          if options[:print_js_path_strings]
+            agent.every_js_path_string do |path|
+              print_content path
+            end
+          end
+
+          if options[:print_js_absolute_path_strings]
+            agent.every_js_absolute_path_string do |path|
+              print_content path
+            end
+          end
+
+          if options[:print_js_relative_path_strings]
+            agent.every_js_relative_path_string do |path|
+              print_content path
+            end
+          end
+
           if options[:print_html_comments]
             agent.every_html_comment do |comment|
               print_content comment
             end
           end
@@ -661,89 +335,9 @@
           if options[:print_comments]
             agent.every_comment do |comment|
               print_content comment
             end
           end
-          end
-
-          #
-          # Creates a new web spider agent.
-          #
-          # @yield [agent]
-          #   The given block will be given the newly created and configured
-          #   web spider agent.
-          #
-          # @yieldparam [Ronin::Web::Spider::Agent] agent
-          #   The newly created web spider agent.
-          #
-          # @return [Ronin::Web::Spider::Agent]
-          #   The newly created web spider agent, after the agent has completed
-          #   it's spidering.
-          #
-          def new_agent(&block)
-            if options[:host]
-              Web::Spider.host(options[:host],**agent_kwargs,&block)
-            elsif options[:domain]
-              Web::Spider.domain(options[:domain],**agent_kwargs,&block)
-            elsif options[:site]
-              Web::Spider.site(options[:site],**agent_kwargs,&block)
-            else
-              print_error "must specify --host, --domain, or --site"
-              exit(-1)
-            end
-          end
-
-          #
-          # Builds keyword arguments for `Ronin::Web::Spider::Agent#initialize`.
-          #
-          # @return [Hash{Symbol => Object}]
-          #   The keyword arguments for `Ronin::Web::Spider::Agent#initialize`.
-          #
-          def agent_kwargs
-            kwargs = {}
-
-            kwargs[:proxy] = options[:proxy] if options[:proxy]
-
-            unless @default_headers.empty?
-              kwargs[:default_headers] = @default_headers
-            end
-
-            unless @host_headers.empty?
-              kwargs[:host_headers] = @host_headers
-            end
-
-            kwargs[:user_agent] = @user_agent       if @user_agent
-            kwargs[:referer]    = options[:referer] if options[:referer]
-
-            kwargs[:delay]     = options[:delay]     if options[:delay]
-            kwargs[:limit]     = options[:limit]     if options[:limit]
-            kwargs[:max_depth] = options[:max_depth] if options[:max_depth]
-
-            kwargs[:queue]   = @queue   unless @queue.empty?
-            kwargs[:history] = @history unless @history.empty?
-
-            if options.has_key?(:strip_fragments)
-              kwargs[:strip_fragments] = options[:strip_fragments]
-            end
-
-            if options.has_key?(:strip_query)
-              kwargs[:strip_query] = options[:strip_query]
-            end
-
-            kwargs[:schemes] = @visit_schemes unless @visit_schemes.empty?
-            kwargs[:hosts]   = @visit_hosts   unless @visit_hosts.empty?
-            kwargs[:ports]   = @visit_ports   unless @visit_ports.empty?
-            kwargs[:links]   = @visit_links   unless @visit_links.empty?
-            kwargs[:exts]    = @visit_exts    unless @visit_exts.empty?
-
-            kwargs[:ignore_hosts] = @ignore_hosts unless @ignore_hosts.empty?
-            kwargs[:ignore_ports] = @ignore_ports unless @ignore_ports.empty?
-            kwargs[:ignore_links] = @ignore_links unless @ignore_links.empty?
-            kwargs[:ignore_exts]  = @ignore_exts  unless @ignore_exts.empty?
-
-            kwargs[:robots] = options[:robots] if options.has_key?(:robots)
-
-            return kwargs
           end

           #
           # Prints the status of a page.
           #
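With new_agent and agent_kwargs also absorbed into SpiderOptions, the command file shrinks to its printing logic. Invocation is unchanged apart from the new flags; for example (example.com is a placeholder host):

    ronin-web spider --domain example.com --print-js-url-strings
    ronin-web spider --site https://example.com/ --print-comments --archive ./pages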