lib/ronin/web/cli/commands/spider.rb in ronin-web-1.0.2 vs lib/ronin/web/cli/commands/spider.rb in ronin-web-2.0.0.rc1
- old
+ new
@@ -1,10 +1,10 @@
# frozen_string_literal: true
#
# ronin-web - A collection of useful web helper methods and commands.
#
-# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
+# Copyright (c) 2006-2024 Hal Brodigan (postmodern.mod3 at gmail.com)
#
# ronin-web is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
@@ -17,11 +17,11 @@
# You should have received a copy of the GNU General Public License
# along with ronin-web. If not, see <https://www.gnu.org/licenses/>.
#
require 'ronin/web/cli/command'
-require 'ronin/web/spider'
+require 'ronin/web/cli/spider_options'
require 'ronin/web/spider/archive'
require 'ronin/web/spider/git_archive'
require 'ronin/support/network/http/user_agents'
require 'command_kit/colors'
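Most of the removals in this diff are a refactor, not deletions: the shared spidering options, their attr_reader backing state, and the initialize method were extracted out of this command into the new Ronin::Web::CLI::SpiderOptions mixin required above. A minimal sketch of how such a CommandKit options mixin is typically wired; the module body is a hypothetical reconstruction, not the actual ronin-web source:

    # Hypothetical reconstruction of an options mixin in CommandKit style.
    module SpiderOptions
      # When included into a command class, defines the shared options on
      # that class via CommandKit's `option` DSL.
      def self.included(command)
        command.option :host, value: {type: String, usage: 'HOST'},
                              desc:  'Spiders the specific HOST'

        command.option :domain, value: {type: String, usage: 'DOMAIN'},
                                desc:  'Spiders the whole domain'

        command.option :site, value: {type: String, usage: 'URL'},
                              desc:  'Spiders the website, starting at the URL'
      end
    end

The command below then only needs `include SpiderOptions` to pick the extracted flags back up.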
@@ -39,30 +39,37 @@
#
# ronin-web spider [options] {--host HOST | --domain DOMAIN | --site URL}
#
# ## Options
#
- # -v, --verbose Enables verbose output
+ # --host HOST Spiders the specific HOST
+ # --domain DOMAIN Spiders the whole domain
+ # --site URL Spiders the website, starting at the URL
# --open-timeout SECS Sets the connection open timeout
# --read-timeout SECS Sets the read timeout
# --ssl-timeout SECS Sets the SSL connection timeout
# --continue-timeout SECS Sets the continue timeout
# --keep-alive-timeout SECS Sets the connection keep alive timeout
- # -P, --proxy PROXY Sets the proxy to use.
+ # -P, --proxy PROXY Sets the proxy to use
# -H, --header NAME: VALUE Sets a default header
# --host-header NAME=VALUE Sets a custom Host header
+ # -U, --user-agent-string STRING The User-Agent string to use
# -u chrome-linux|chrome-macos|chrome-windows|chrome-iphone|chrome-ipad|chrome-android|firefox-linux|firefox-macos|firefox-windows|firefox-iphone|firefox-ipad|firefox-android|safari-macos|safari-iphone|safari-ipad|edge,
# --user-agent The User-Agent to use
- # -U, --user-agent-string STRING The User-Agent string to use
# -R, --referer URL Sets the Referer URL
# --delay SECS Sets the delay in seconds between each request
# -l, --limit COUNT Only spiders up to COUNT pages
# -d, --max-depth DEPTH Only spiders up to max depth
# --enqueue URL Adds the URL to the queue
# --visited URL Marks the URL as previously visited
# --strip-fragments Enables/disables stripping the fragment component of every URL
# --strip-query Enables/disables stripping the query component of every URL
+ # --visit-scheme SCHEME Visit URLs with the URI scheme
+ # --visit-schemes-like /REGEX/ Visit URLs with URI schemes that match the REGEX
+ # --ignore-scheme SCHEME Ignore the URLs with the URI scheme
+ # --ignore-schemes-like /REGEX/
+ # Ignore the URLs with URI schemes matching the REGEX
# --visit-host HOST Visit URLs with the matching host name
# --visit-hosts-like /REGEX/ Visit URLs with hostnames that match the REGEX
# --ignore-host HOST Ignore the host name
# --ignore-hosts-like /REGEX/ Ignore the host names matching the REGEX
# --visit-port PORT Visit URLs with the matching port number
@@ -76,324 +83,51 @@
# --visit-ext FILE_EXT Visit URLs with the matching file ext
# --visit-exts-like /REGEX/ Visit URLs with file exts that match the REGEX
# --ignore-ext FILE_EXT Ignore the URLs with the file ext
# --ignore-exts-like /REGEX/ Ignore URLs with file exts matching the REGEX
# -r, --robots Specifies whether to honor robots.txt
- # --host HOST Spiders the specific HOST
- # --domain DOMAIN Spiders the whole domain
- # --site URL Spiders the website, starting at the URL
- # --print-status Print the status codes for each URL
+ # -v, --verbose Enables verbose output
+ # --print-status Print the status codes for each URL
# --print-headers Print response headers for each URL
# --print-header NAME Prints a specific header
# --history FILE The history file
# --archive DIR Archive every visited page to the DIR
# --git-archive DIR Archive every visited page to the git repository
# -X, --xpath XPATH Evaluates the XPath on each HTML page
# -C, --css-path XPATH Evaluates the CSS-path on each HTML page
+ # --print-hosts Print all discovered hostnames
+ # --print-certs Print all encountered SSL/TLS certificates
+ # --save-certs Saves all encountered SSL/TLS certificates
+ # --print-js-strings Print all JavaScript strings
+ # --print-js-url-strings Print URL strings found in JavaScript
+ # --print-js-path-strings Print path strings found in JavaScript
+ # --print-js-absolute-path-strings
+ # Only print absolute path strings found in JavaScript
+ # --print-js-relative-path-strings
+ # Only print relative path strings found in JavaScript
+ # --print-html-comments Print HTML comments
+ # --print-js-comments Print JavaScript comments
+ # --print-comments Print all HTML and JavaScript comments
# -h, --help Print help information
#
# ## Examples
#
# ronin-web spider --host scanme.nmap.org
# ronin-web spider --domain nmap.org
# ronin-web spider --site https://scanme.nmap.org/
#
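The new JavaScript and comment extraction flags documented above compose with these examples, e.g. (invocations built only from flags shown in this diff):

    ronin-web spider --site https://scanme.nmap.org/ --print-js-url-strings
    ronin-web spider --domain nmap.org --print-comments --archive ./nmap-pages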
class Spider < Command
+ include SpiderOptions
include CommandKit::Colors
include CommandKit::Printing::Indent
include CommandKit::Options::Verbose
usage '[options] {--host HOST | --domain DOMAIN | --site URL}'
- option :open_timeout, value: {
- type: Integer,
- usage: 'SECS',
- default: Spidr.open_timeout
- },
- desc: 'Sets the connection open timeout'
+ option :print_status, desc: 'Print the status codes for each URL'
- option :read_timeout, value: {
- type: Integer,
- usage: 'SECS',
- default: Spidr.read_timeout
- },
- desc: 'Sets the read timeout'
-
- option :ssl_timeout, value: {
- type: Integer,
- usage: 'SECS',
- default: Spidr.ssl_timeout
- },
- desc: 'Sets the SSL connection timeout'
-
- option :continue_timeout, value: {
- type: Integer,
- usage: 'SECS',
- default: Spidr.continue_timeout
- },
- desc: 'Sets the continue timeout'
-
- option :keep_alive_timeout, value: {
- type: Integer,
- usage: 'SECS',
- default: Spidr.keep_alive_timeout
- },
- desc: 'Sets the connection keep alive timeout'
-
- option :proxy, short: '-P',
- value: {
- type: String,
- usage: 'PROXY'
- },
- desc: 'Sets the proxy to use'
-
- option :header, short: '-H',
- value: {
- type: /\A[^\s:]+:.*\z/,
- usage: 'NAME: VALUE'
- },
- desc: 'Sets a default header' do |header|
- name, value = header.split(/:\s*/,2)
-
- @default_headers[name] = value
- end
-
- option :host_header, value: {
- type: /\A[^\s=]+=[^\s=]+\z/,
- usage: 'NAME=VALUE'
- },
- desc: 'Sets a custom Host header' do |name_value|
- name, value = name_value.split('=',2)
-
- @host_headers[name] = value
- end
-
- option :user_agent, value: {
- type: String,
- usage: 'USER-AGENT'
- },
- desc: 'Sets the User-Agent string'
-
- option :user_agent_string, short: '-U',
- value: {
- type: String,
- usage: 'STRING'
- },
- desc: 'The User-Agent string to use' do |ua|
- @user_agent = ua
- end
-
- option :user_agent, short: '-u',
- value: {
- type: Support::Network::HTTP::UserAgents::ALIASES.transform_keys { |key|
- key.to_s.tr('_','-')
- }
- },
- desc: 'The User-Agent to use' do |name|
- @user_agent = name
- end
-
- option :referer, short: '-R',
- value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Sets the Referer URL'
-
- option :delay, short: '-d',
- value: {
- type: Numeric,
- usage: 'SECS'
- },
- desc: 'Sets the delay in seconds between each request'
-
- option :limit, short: '-l',
- value: {
- type: Integer,
- usage: 'COUNT'
- },
- desc: 'Only spiders up to COUNT pages'
-
- option :max_depth, short: '-d',
- value: {
- type: Integer,
- usage: 'DEPTH'
- },
- desc: 'Only spiders up to max depth'
-
- option :enqueue, value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Adds the URL to the queue' do |url|
- @queue << url
- end
-
- option :visited, value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Marks the URL as previously visited' do |url|
- @history << url
- end
-
- option :strip_fragments, desc: 'Enables/disables stripping the fragment component of every URL'
-
- option :strip_query, desc: 'Enables/disables stripping the query component of every URL'
-
- option :visit_host, value: {
- type: String,
- usage: 'HOST'
- },
- desc: 'Visit URLs with the matching host name' do |host|
- @visit_hosts << host
- end
-
- option :visit_hosts_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Visit URLs with hostnames that match the REGEX' do |regex|
- @visit_hosts << regex
- end
-
- option :ignore_host, value: {
- type: String,
- usage: 'HOST'
- },
- desc: 'Ignore the host name' do |host|
- @ignore_hosts << host
- end
-
- option :ignore_hosts_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Ignore the host names matching the REGEX' do |regex|
- @ignore_hosts << regex
- end
-
- option :visit_port, value: {
- type: Integer,
- usage: 'PORT'
- },
- desc: 'Visit URLs with the matching port number' do |port|
- @visit_ports << port
- end
-
- option :visit_ports_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Visit URLs with port numbers that match the REGEX' do |regex|
- @visit_ports << regex
- end
-
- option :ignore_port, value: {
- type: Integer,
- usage: 'PORT'
- },
- desc: 'Ignore the port number' do |port|
- @ignore_ports << port
- end
-
- option :ignore_ports_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Ignore the port numbers matching the REGEX' do |regex|
- @ignore_ports << regex
- end
-
- option :visit_link, value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Visit the URL' do |link|
- @visit_links << link
- end
-
- option :visit_links_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Visit URLs that match the REGEX' do |regex|
- @visit_links << regex
- end
-
- option :ignore_link, value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Ignore the URL' do |link|
- @ignore_links << link
- end
-
- option :ignore_links_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Ignore URLs matching the REGEX' do |regex|
- @ignore_links << regex
- end
-
- option :visit_ext, value: {
- type: String,
- usage: 'FILE_EXT'
- },
- desc: 'Visit URLs with the matching file ext' do |ext|
- @visit_exts << ext
- end
-
- option :visit_exts_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Visit URLs with file exts that match the REGEX' do |regex|
- @visit_exts << regex
- end
-
- option :ignore_ext, value: {
- type: String,
- usage: 'FILE_EXT'
- },
- desc: 'Ignore the URLs with the file ext' do |ext|
- @ignore_exts << ext
- end
-
- option :ignore_exts_like, value: {
- type: Regexp,
- usage: '/REGEX/'
- },
- desc: 'Ignore URLs with file exts matching the REGEX' do |regex|
- @ignore_exts << regex
- end
-
- option :robots, short: '-r',
- desc: 'Specifies whether to honor robots.txt'
-
- option :host, value: {
- type: String,
- usage: 'HOST'
- },
- desc: 'Spiders the specific HOST'
-
- option :domain, value: {
- type: String,
- usage: 'DOMAIN'
- },
- desc: 'Spiders the whole domain'
-
- option :site, value: {
- type: String,
- usage: 'URL'
- },
- desc: 'Spiders the website, starting at the URL'
-
- option :print_status, desc: 'Print the status codes for each URL'
-
option :print_headers, desc: 'Print response headers for each URL'
option :print_header, value: {
type: String,
usage: 'NAME'
@@ -438,10 +172,18 @@
option :save_certs, desc: 'Saves all encountered SSL/TLS certificates'
option :print_js_strings, desc: 'Print all JavaScript strings'
+ option :print_js_url_strings, desc: 'Print URL strings found in JavaScript'
+
+ option :print_js_path_strings, desc: 'Print path strings found in JavaScript'
+
+ option :print_js_absolute_path_strings, desc: 'Only print absolute path strings found in JavaScript'
+
+ option :print_js_relative_path_strings, desc: 'Only print relative path strings found in JavaScript'
+
option :print_html_comments, desc: 'Print HTML comments'
option :print_js_comments, desc: 'Print JavaScript comments'
option :print_comments, desc: 'Print all HTML and JavaScript comments'
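Each of the four new print_js_* options above is a plain boolean flag; in run (shown in a later hunk) each one gates a matching agent.every_js_* callback. Roughly how the categories divide, assuming ronin-web-spider's string classification; the literals are illustrative only:

    # Given JavaScript source containing these string literals:
    #
    #   "https://api.example.com/v1"   reported by --print-js-url-strings
    #   "/api/v1/users"                reported by --print-js-absolute-path-strings
    #   "../assets/app.css"            reported by --print-js-relative-path-strings
    #
    # --print-js-path-strings reports both path categories;
    # --print-js-strings reports every string literal.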
@@ -454,103 +196,11 @@
"--site https://scanme.nmap.org/"
]
man_page 'ronin-web-spider.1'
- # The default HTTP headers to send with every request.
#
- # @return [Hash{String => String}]
- attr_reader :default_headers
-
- # The mapping of custom `Host` headers.
- #
- # @return [Hash{String => String}]
- attr_reader :host_headers
-
- # The pre-existing queue of URLs to start spidering with.
- #
- # @return [Array<String>]
- attr_reader :queue
-
- # The pre-existing list of previously visited URLs to start spidering with.
- #
- # @return [Array<String>]
- attr_reader :history
-
- # The schemes to visit.
- #
- # @return [Array<String>]
- attr_reader :visit_schemes
-
- # The hosts to visit.
- #
- # @return [Array<String, Regexp>]
- attr_reader :visit_hosts
-
- # The port numbers to visit.
- #
- # @return [Array<Integer, Regexp>]
- attr_reader :visit_ports
-
- # The links to visit.
- #
- # @return [Array<String, Regexp>]
- attr_reader :visit_links
-
- # The URL file extensions to visit.
- #
- # @return [Array<String, Regexp>]
- attr_reader :visit_exts
-
- # The hosts to ignore.
- #
- # @return [Array<String, Regexp>]
- attr_reader :ignore_hosts
-
- # The port numbers to ignore.
- #
- # @return [Array<Integer, Regexp>]
- attr_reader :ignore_ports
-
- # The links to ignore.
- #
- # @return [Array<String, Regexp>]
- attr_reader :ignore_links
-
- # The URL file extensions to ignore.
- #
- # @return [Array<String, Regexp>]
- attr_reader :ignore_exts
-
- #
- # Initializes the spider command.
- #
- # @param [Hash{Symbol => Object}] kwargs
- # Additional keyword arguments.
- #
- def initialize(**kwargs)
- super(**kwargs)
-
- @default_headers = {}
- @host_headers = {}
-
- @queue = []
- @history = []
-
- @visit_schemes = []
- @visit_hosts = []
- @visit_ports = []
- @visit_links = []
- @visit_exts = []
-
- @ignore_hosts = []
- @ignore_ports = []
- @ignore_links = []
- @ignore_exts = []
- end
-
- #
# Runs the `ronin-web spider` command.
#
def run
archive = if options[:archive]
Web::Spider::Archive.open(options[:archive])
@@ -644,10 +294,34 @@
agent.every_js_string do |string|
print_content string
end
end
+ if options[:print_js_url_strings]
+ agent.every_js_url_string do |url|
+ print_content url
+ end
+ end
+
+ if options[:print_js_path_strings]
+ agent.every_js_path_string do |path|
+ print_content path
+ end
+ end
+
+ if options[:print_js_absolute_path_strings]
+ agent.every_js_absolute_path_string do |path|
+ print_content path
+ end
+ end
+
+ if options[:print_js_relative_path_strings]
+ agent.every_js_relative_path_string do |path|
+ print_content path
+ end
+ end
+
if options[:print_html_comments]
agent.every_html_comment do |comment|
print_content comment
end
end
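The same callbacks are available when driving the spider from Ruby instead of the CLI; a minimal sketch using only agent methods that appear in this diff (the target URL is a placeholder):

    require 'ronin/web/spider'

    Ronin::Web::Spider.site('https://www.example.com/') do |agent|
      # Fires for every URL string literal found in JavaScript.
      agent.every_js_url_string { |url| puts "js url:  #{url}" }

      # Fires for every path string literal found in JavaScript.
      agent.every_js_path_string { |path| puts "js path: #{path}" }

      # Fires for every HTML comment on every visited page.
      agent.every_html_comment { |comment| puts "comment: #{comment}" }
    end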
@@ -661,89 +335,9 @@
if options[:print_comments]
agent.every_comment do |comment|
print_content comment
end
end
- end
-
- #
- # Creates a new web spider agent.
- #
- # @yield [agent]
- # The given block will be given the newly created and configured
- # web spider agent.
- #
- # @yieldparam [Ronin::Web::Spider::Agent] agent
- # The newly created web spider agent.
- #
- # @return [Ronin::Web::Spider::Agent]
- # The newly created web spider agent, after the agent has completed
- # its spidering.
- #
- def new_agent(&block)
- if options[:host]
- Web::Spider.host(options[:host],**agent_kwargs,&block)
- elsif options[:domain]
- Web::Spider.domain(options[:domain],**agent_kwargs,&block)
- elsif options[:site]
- Web::Spider.site(options[:site],**agent_kwargs,&block)
- else
- print_error "must specify --host, --domain, or --site"
- exit(-1)
- end
- end
-
- #
- # Builds keyword arguments for `Ronin::Web::Spider::Agent#initialize`.
- #
- # @return [Hash{Symbol => Object}]
- # The keyword arguments for `Ronin::Web::Spider::Agent#initialize`.
- #
- def agent_kwargs
- kwargs = {}
-
- kwargs[:proxy] = options[:proxy] if options[:proxy]
-
- unless @default_headers.empty?
- kwargs[:default_headers] = @default_headers
- end
-
- unless @host_headers.empty?
- kwargs[:host_headers] = @host_headers
- end
-
- kwargs[:user_agent] = @user_agent if @user_agent
- kwargs[:referer] = options[:referer] if options[:referer]
-
- kwargs[:delay] = options[:delay] if options[:delay]
- kwargs[:limit] = options[:limit] if options[:limit]
- kwargs[:max_depth] = options[:max_depth] if options[:max_depth]
-
- kwargs[:queue] = @queue unless @queue.empty?
- kwargs[:history] = @history unless @history.empty?
-
- if options.has_key?(:strip_fragments)
- kwargs[:strip_fragments] = options[:strip_fragments]
- end
-
- if options.has_key?(:strip_query)
- kwargs[:strip_query] = options[:strip_query]
- end
-
- kwargs[:schemes] = @visit_schemes unless @visit_schemes.empty?
- kwargs[:hosts] = @visit_hosts unless @visit_hosts.empty?
- kwargs[:ports] = @visit_ports unless @visit_ports.empty?
- kwargs[:links] = @visit_links unless @visit_links.empty?
- kwargs[:exts] = @visit_exts unless @visit_exts.empty?
-
- kwargs[:ignore_hosts] = @ignore_hosts unless @ignore_hosts.empty?
- kwargs[:ignore_ports] = @ignore_ports unless @ignore_ports.empty?
- kwargs[:ignore_links] = @ignore_links unless @ignore_links.empty?
- kwargs[:ignore_exts] = @ignore_exts unless @ignore_exts.empty?
-
- kwargs[:robots] = options[:robots] if options.has_key?(:robots)
-
- return kwargs
end
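The removed agent_kwargs above documents the full mapping from CLI flags to the spider agent's keyword arguments (now presumably built inside SpiderOptions). Driving the library directly with the same keywords, a sketch: the keyword names come from the removed code above, every_page is standard Spidr API, and the URL and pattern are placeholders:

    require 'ronin/web/spider'

    # Only visit https:// URLs whose host name ends in .example.com,
    # mirroring `--visit-scheme https --visit-hosts-like /REGEX/`.
    Ronin::Web::Spider.site(
      'https://www.example.com/',
      schemes: %w[https],
      hosts:   [/\.example\.com\z/]
    ) do |agent|
      agent.every_page do |page|
        puts page.url
      end
    end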
#
# Prints the status of a page.
#