#
# ronin-web - A collection of useful web helper methods and commands.
#
# Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com)
#
# ronin-web is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ronin-web is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ronin-web. If not, see <https://www.gnu.org/licenses/>.
#

require 'ronin/web/cli/command'
require 'ronin/web/spider'
require 'ronin/web/spider/archive'
require 'ronin/web/spider/git_archive'
require 'ronin/support/network/http/user_agents'

require 'command_kit/colors'
require 'command_kit/printing/indent'
require 'command_kit/options/verbose'
module Ronin
  module Web
    class CLI
      module Commands
        #
        # Spiders a website.
        #
        # ## Usage
        #
        #     ronin-web spider [options] {--host HOST | --domain DOMAIN | --site URL}
        #
        # ## Options
        #
        #     -v, --verbose                    Enables verbose output
        #         --open-timeout SECS          Sets the connection open timeout
        #         --read-timeout SECS          Sets the read timeout
        #         --ssl-timeout SECS           Sets the SSL connection timeout
        #         --continue-timeout SECS      Sets the continue timeout
        #         --keep-alive-timeout SECS    Sets the connection keep alive timeout
        #     -P, --proxy PROXY                Sets the proxy to use
        #     -H, --header NAME: VALUE         Sets a default header
        #         --host-header NAME=VALUE     Sets the Host header for the given host
        #     -u chrome-linux|chrome-macos|chrome-windows|chrome-iphone|chrome-ipad|chrome-android|firefox-linux|firefox-macos|firefox-windows|firefox-iphone|firefox-ipad|firefox-android|safari-macos|safari-iphone|safari-ipad|edge,
        #         --user-agent                 The User-Agent to use
        #     -U, --user-agent-string STRING   The User-Agent string to use
        #     -R, --referer URL                Sets the Referer URL
        #         --delay SECS                 Sets the delay in seconds between each request
        #     -l, --limit COUNT                Only spiders up to COUNT pages
        #     -d, --max-depth DEPTH            Only spiders up to max depth
        #         --enqueue URL                Adds the URL to the queue
        #         --visited URL                Marks the URL as previously visited
        #         --strip-fragments            Enables/disables stripping the fragment component of every URL
        #         --strip-query                Enables/disables stripping the query component of every URL
        #         --visit-host HOST            Visit URLs with the matching host name
        #         --visit-hosts-like /REGEX/   Visit URLs with hostnames that match the REGEX
        #         --ignore-host HOST           Ignore the host name
        #         --ignore-hosts-like /REGEX/  Ignore the host names matching the REGEX
        #         --visit-port PORT            Visit URLs with the matching port number
        #         --visit-ports-like /REGEX/   Visit URLs with port numbers that match the REGEX
        #         --ignore-port PORT           Ignore the port number
        #         --ignore-ports-like /REGEX/  Ignore the port numbers matching the REGEX
        #         --visit-link URL             Visit the URL
        #         --visit-links-like /REGEX/   Visit URLs that match the REGEX
        #         --ignore-link URL            Ignore the URL
        #         --ignore-links-like /REGEX/  Ignore URLs matching the REGEX
        #         --visit-ext FILE_EXT         Visit URLs with the matching file ext
        #         --visit-exts-like /REGEX/    Visit URLs with file exts that match the REGEX
        #         --ignore-ext FILE_EXT        Ignore the URLs with the file ext
        #         --ignore-exts-like /REGEX/   Ignore URLs with file exts matching the REGEX
        #     -r, --robots                     Specifies whether to honor robots.txt
        #         --host HOST                  Spiders the specific HOST
        #         --domain DOMAIN              Spiders the whole domain
        #         --site URL                   Spiders the website, starting at the URL
        #         --print-status               Print the status codes for each URL
        #         --print-headers              Print response headers for each URL
        #         --print-header NAME          Prints a specific header
        #         --archive DIR                Archive every visited page to the DIR
        #         --git-archive DIR            Archive every visited page to the git repository
        #     -X, --xpath XPATH                Evaluates the XPath on each HTML page
        #     -C, --css-path XPATH             Evaluates the CSS-path on each HTML page
        #         --print-hosts                Print all discovered hostnames
        #         --print-certs                Print all encountered SSL/TLS certificates
        #         --save-certs                 Saves all encountered SSL/TLS certificates
        #         --print-js-strings           Print all JavaScript strings
        #         --print-html-comments        Print HTML comments
        #         --print-js-comments          Print JavaScript comments
        #         --print-comments             Print all HTML and JavaScript comments
        #     -h, --help                       Print help information
        #
        # ## Examples
        #
        #     ronin-web spider --host scanme.nmap.org
        #     ronin-web spider --domain nmap.org
        #     ronin-web spider --site https://scanme.nmap.org/
        #
        class Spider < Command

          include CommandKit::Colors
          include CommandKit::Printing::Indent
          include CommandKit::Options::Verbose

          usage '[options] {--host HOST | --domain DOMAIN | --site URL}'

          option :open_timeout, value: {
                                  type:    Integer,
                                  usage:   'SECS',
                                  default: Spidr.open_timeout
                                },
                                desc: 'Sets the connection open timeout'

          option :read_timeout, value: {
                                  type:    Integer,
                                  usage:   'SECS',
                                  default: Spidr.read_timeout
                                },
                                desc: 'Sets the read timeout'

          option :ssl_timeout, value: {
                                 type:    Integer,
                                 usage:   'SECS',
                                 default: Spidr.ssl_timeout
                               },
                               desc: 'Sets the SSL connection timeout'

          option :continue_timeout, value: {
                                      type:    Integer,
                                      usage:   'SECS',
                                      default: Spidr.continue_timeout
                                    },
                                    desc: 'Sets the continue timeout'

          option :keep_alive_timeout, value: {
                                        type:    Integer,
                                        usage:   'SECS',
                                        default: Spidr.keep_alive_timeout
                                      },
                                      desc: 'Sets the connection keep alive timeout'

          option :proxy, short: '-P',
                         value: {
                           type:  String,
                           usage: 'PROXY'
                         },
                         desc: 'Sets the proxy to use'

          option :header, short: '-H',
                          value: {
                            type:  /\A[^\s:]+:.*\z/,
                            usage: 'NAME: VALUE'
                          },
                          desc: 'Sets a default header' do |header|
                            name, value = header.split(/:\s*/,2)

                            @default_headers[name] = value
                          end

          option :host_header, value: {
                                 type:  /\A[^\s=]+=[^\s=]+\z/,
                                 usage: 'NAME=VALUE'
                               },
                               desc: 'Sets the Host header for the given host' do |name_value|
                                 name, value = name_value.split('=',2)

                                 @host_headers[name] = value
                               end

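          # NOTE: an illustrative example of how the two header options above
          # are parsed (hypothetical header values):
          #
          #     -H 'X-Api-Key: abc123'
          #     # => @default_headers['X-Api-Key'] = 'abc123'
          #
          #     --host-header example.com=www.example.com
          #     # => @host_headers['example.com'] = 'www.example.com'
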
          option :user_agent_string, short: '-U',
                                     value: {
                                       type:  String,
                                       usage: 'STRING'
                                     },
                                     desc: 'The User-Agent string to use' do |ua|
                                       @user_agent = ua
                                     end

          option :user_agent, short: '-u',
                              value: {
                                type: Hash[
                                  Support::Network::HTTP::UserAgents::ALIASES.keys.map { |key|
                                    [key.to_s.tr('_','-'), key]
                                  }
                                ]
                              },
                              desc: 'The User-Agent to use' do |name|
                                @user_agent = name
                              end

          option :referer, short: '-R',
                           value: {
                             type:  String,
                             usage: 'URL'
                           },
                           desc: 'Sets the Referer URL'

          option :delay, value: {
                           type:  Numeric,
                           usage: 'SECS'
                         },
                         desc: 'Sets the delay in seconds between each request'

          option :limit, short: '-l',
                         value: {
                           type:  Integer,
                           usage: 'COUNT'
                         },
                         desc: 'Only spiders up to COUNT pages'

          option :max_depth, short: '-d',
                             value: {
                               type:  Integer,
                               usage: 'DEPTH'
                             },
                             desc: 'Only spiders up to max depth'

          option :enqueue, value: {
                             type:  String,
                             usage: 'URL'
                           },
                           desc: 'Adds the URL to the queue' do |url|
                             @queue << url
                           end

          option :visited, value: {
                             type:  String,
                             usage: 'URL'
                           },
                           desc: 'Marks the URL as previously visited' do |url|
                             @history << url
                           end

          option :strip_fragments, desc: 'Enables/disables stripping the fragment component of every URL'

          option :strip_query, desc: 'Enables/disables stripping the query component of every URL'

          option :visit_host, value: {
                                type:  String,
                                usage: 'HOST'
                              },
                              desc: 'Visit URLs with the matching host name' do |host|
                                @visit_hosts << host
                              end

          option :visit_hosts_like, value: {
                                      type:  Regexp,
                                      usage: '/REGEX/'
                                    },
                                    desc: 'Visit URLs with hostnames that match the REGEX' do |regex|
                                      @visit_hosts << regex
                                    end

          option :ignore_host, value: {
                                 type:  String,
                                 usage: 'HOST'
                               },
                               desc: 'Ignore the host name' do |host|
                                 @ignore_hosts << host
                               end

          option :ignore_hosts_like, value: {
                                       type:  Regexp,
                                       usage: '/REGEX/'
                                     },
                                     desc: 'Ignore the host names matching the REGEX' do |regex|
                                       @ignore_hosts << regex
                                     end

          option :visit_port, value: {
                                type:  Integer,
                                usage: 'PORT'
                              },
                              desc: 'Visit URLs with the matching port number' do |port|
                                @visit_ports << port
                              end

          option :visit_ports_like, value: {
                                      type:  Regexp,
                                      usage: '/REGEX/'
                                    },
                                    desc: 'Visit URLs with port numbers that match the REGEX' do |regex|
                                      @visit_ports << regex
                                    end

          option :ignore_port, value: {
                                 type:  Integer,
                                 usage: 'PORT'
                               },
                               desc: 'Ignore the port number' do |port|
                                 @ignore_ports << port
                               end

          option :ignore_ports_like, value: {
                                       type:  Regexp,
                                       usage: '/REGEX/'
                                     },
                                     desc: 'Ignore the port numbers matching the REGEX' do |regex|
                                       @ignore_ports << regex
                                     end

          option :visit_link, value: {
                                type:  String,
                                usage: 'URL'
                              },
                              desc: 'Visit the URL' do |link|
                                @visit_links << link
                              end

          option :visit_links_like, value: {
                                      type:  Regexp,
                                      usage: '/REGEX/'
                                    },
                                    desc: 'Visit URLs that match the REGEX' do |regex|
                                      @visit_links << regex
                                    end

          option :ignore_link, value: {
                                 type:  String,
                                 usage: 'URL'
                               },
                               desc: 'Ignore the URL' do |link|
                                 @ignore_links << link
                               end

          option :ignore_links_like, value: {
                                       type:  Regexp,
                                       usage: '/REGEX/'
                                     },
                                     desc: 'Ignore URLs matching the REGEX' do |regex|
                                       @ignore_links << regex
                                     end

          option :visit_ext, value: {
                               type:  String,
                               usage: 'FILE_EXT'
                             },
                             desc: 'Visit URLs with the matching file ext' do |ext|
                               @visit_exts << ext
                             end

          option :visit_exts_like, value: {
                                     type:  Regexp,
                                     usage: '/REGEX/'
                                   },
                                   desc: 'Visit URLs with file exts that match the REGEX' do |regex|
                                     @visit_exts << regex
                                   end

          option :ignore_ext, value: {
                                type:  String,
                                usage: 'FILE_EXT'
                              },
                              desc: 'Ignore the URLs with the file ext' do |ext|
                                @ignore_exts << ext
                              end

          option :ignore_exts_like, value: {
                                      type:  Regexp,
                                      usage: '/REGEX/'
                                    },
                                    desc: 'Ignore URLs with file exts matching the REGEX' do |regex|
                                      @ignore_exts << regex
                                    end

          option :robots, short: '-r',
                          desc: 'Specifies whether to honor robots.txt'

          option :host, value: {
                          type:  String,
                          usage: 'HOST'
                        },
                        desc: 'Spiders the specific HOST'

          option :domain, value: {
                            type:  String,
                            usage: 'DOMAIN'
                          },
                          desc: 'Spiders the whole domain'

          option :site, value: {
                          type:  String,
                          usage: 'URL'
                        },
                        desc: 'Spiders the website, starting at the URL'

          option :print_status, desc: 'Print the status codes for each URL'

          option :print_headers, desc: 'Print response headers for each URL'

          option :print_header, value: {
                                  type:  String,
                                  usage: 'NAME'
                                },
                                desc: 'Prints a specific header'

          option :archive, value: {
                             type:  String,
                             usage: 'DIR'
                           },
                           desc: 'Archive every visited page to the DIR'

          option :git_archive, value: {
                                 type:  String,
                                 usage: 'DIR'
                               },
                               desc: 'Archive every visited page to the git repository'

          option :xpath, short: '-X',
                         value: {
                           type:  String,
                           usage: 'XPATH'
                         },
                         desc: 'Evaluates the XPath on each HTML page'

          option :css_path, short: '-C',
                            value: {
                              type:  String,
                              usage: 'XPATH'
                            },
                            desc: 'Evaluates the CSS-path on each HTML page'

          option :print_hosts, desc: 'Print all discovered hostnames'

          option :print_certs, desc: 'Print all encountered SSL/TLS certificates'

          option :save_certs, desc: 'Saves all encountered SSL/TLS certificates'

          option :print_js_strings, desc: 'Print all JavaScript strings'

          option :print_html_comments, desc: 'Print HTML comments'

          option :print_js_comments, desc: 'Print JavaScript comments'

          option :print_comments, desc: 'Print all HTML and JavaScript comments'

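          # NOTE: an illustrative invocation combining the filtering and
          # printing options defined above (hypothetical target):
          #
          #     ronin-web spider --site https://example.com/ \
          #                      --ignore-ext pdf            \
          #                      --print-status --print-comments
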
          description 'Spiders a website'

          examples [
            "--host scanme.nmap.org",
            "--domain nmap.org",
            "--site https://scanme.nmap.org/"
          ]

          man_page 'ronin-web-spider.1'

          # The default HTTP headers to send with every request.
          #
          # @return [Hash{String => String}]
          attr_reader :default_headers

          # The mapping of custom `Host` headers.
          #
          # @return [Hash{String => String}]
          attr_reader :host_headers

          # The pre-existing queue of URLs to start spidering with.
          #
          # @return [Array<String>]
          attr_reader :queue

          # The pre-existing list of previously visited URLs to start
          # spidering with.
          #
          # @return [Array<String>]
          attr_reader :history

          # The schemes to visit.
          #
          # @return [Array<String>]
          attr_reader :visit_schemes

          # The hosts to visit.
          #
          # @return [Array<String, Regexp>]
          attr_reader :visit_hosts

          # The port numbers to visit.
          #
          # @return [Array<Integer, Regexp>]
          attr_reader :visit_ports

          # The links to visit.
          #
          # @return [Array<String, Regexp>]
          attr_reader :visit_links

          # The URL file extensions to visit.
          #
          # @return [Array<String, Regexp>]
          attr_reader :visit_exts

          # The hosts to ignore.
          #
          # @return [Array<String, Regexp>]
          attr_reader :ignore_hosts

          # The port numbers to ignore.
          #
          # @return [Array<Integer, Regexp>]
          attr_reader :ignore_ports

          # The links to ignore.
          #
          # @return [Array<String, Regexp>]
          attr_reader :ignore_links

          # The URL file extensions to ignore.
          #
          # @return [Array<String, Regexp>]
          attr_reader :ignore_exts

          #
          # Initializes the spider command.
          #
          # @param [Hash{Symbol => Object}] kwargs
          #   Additional keyword arguments.
          #
          def initialize(**kwargs)
            super(**kwargs)

            @default_headers = {}
            @host_headers    = {}

            @queue   = []
            @history = []

            @visit_schemes = []
            @visit_hosts   = []
            @visit_ports   = []
            @visit_links   = []
            @visit_exts    = []

            @ignore_hosts = []
            @ignore_ports = []
            @ignore_links = []
            @ignore_exts  = []
          end

          #
          # Runs the `ronin-web spider` command.
          #
          def run
            archive = if options[:archive]
                        Web::Spider::Archive.open(options[:archive])
                      elsif options[:git_archive]
                        Web::Spider::GitArchive.open(options[:git_archive])
                      end

            agent = new_agent do |agent|
              agent.every_page do |page|
                print_page(page)
              end

              agent.every_failed_url do |url|
                print_verbose "failed to request #{url}"
              end

              if options[:print_hosts]
                agent.every_host do |host|
                  print_verbose "spidering new host #{host}"
                end
              end

              if options[:print_certs]
                agent.every_cert do |cert|
                  print_verbose "encountered new certificate for #{cert.subject.common_name}"
                end
              end

              if options[:print_js_strings]
                agent.every_js_string do |string|
                  print_content string
                end
              end

              if options[:print_html_comments]
                agent.every_html_comment do |comment|
                  print_content comment
                end
              end

              if options[:print_js_comments]
                agent.every_js_comment do |comment|
                  print_content comment
                end
              end

              if options[:print_comments]
                agent.every_comment do |comment|
                  print_content comment
                end
              end

              if archive
                agent.every_ok_page do |page|
                  archive.write(page.url,page.body)
                end
              end
            end

            if options[:git_archive]
              archive.commit "Updated #{Time.now}"
            end

            if options[:print_hosts]
              puts
              puts "Spidered the following hosts:"
              puts

              indent do
                agent.visited_hosts.each do |host|
                  puts host
                end
              end
            end

            if options[:print_certs]
              puts
              puts "Discovered the following certs:"
              puts

              agent.collected_certs.each do |cert|
                puts cert
                puts
              end
            end
          end

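          # NOTE: `new_agent` below simply dispatches to `Ronin::Web::Spider`'s
          # class methods. A rough standalone equivalent of what `run` sets up,
          # assuming a hypothetical target URL:
          #
          #     require 'ronin/web/spider'
          #
          #     Ronin::Web::Spider.site('https://example.com/') do |agent|
          #       agent.every_page { |page| puts page.url }
          #     end
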
          #
          # Creates a new web spider agent.
          #
          # @yield [agent]
          #   The given block will be given the newly created and configured
          #   web spider agent.
          #
          # @yieldparam [Ronin::Web::Spider::Agent] agent
          #   The newly created web spider agent.
          #
          # @return [Ronin::Web::Spider::Agent]
          #   The newly created web spider agent, after the agent has
          #   completed its spidering.
          #
          def new_agent(&block)
            if options[:host]
              Web::Spider.host(options[:host],**agent_kwargs,&block)
            elsif options[:domain]
              Web::Spider.domain(options[:domain],**agent_kwargs,&block)
            elsif options[:site]
              Web::Spider.site(options[:site],**agent_kwargs,&block)
            else
              print_error "must specify --host, --domain, or --site"
              exit(-1)
            end
          end

          #
          # Builds keyword arguments for `Ronin::Web::Spider::Agent#initialize`.
          #
          # @return [Hash{Symbol => Object}]
          #   The keyword arguments for `Ronin::Web::Spider::Agent#initialize`.
          #
          def agent_kwargs
            kwargs = {}

            kwargs[:proxy] = options[:proxy] if options[:proxy]

            unless @default_headers.empty?
              kwargs[:default_headers] = @default_headers
            end

            unless @host_headers.empty?
              kwargs[:host_headers] = @host_headers
            end

            kwargs[:user_agent] = @user_agent       if @user_agent
            kwargs[:referer]    = options[:referer] if options[:referer]

            kwargs[:delay]     = options[:delay]     if options[:delay]
            kwargs[:limit]     = options[:limit]     if options[:limit]
            kwargs[:max_depth] = options[:max_depth] if options[:max_depth]

            kwargs[:queue]   = @queue   unless @queue.empty?
            kwargs[:history] = @history unless @history.empty?

            if options.has_key?(:strip_fragments)
              kwargs[:strip_fragments] = options[:strip_fragments]
            end

            if options.has_key?(:strip_query)
              kwargs[:strip_query] = options[:strip_query]
            end

            kwargs[:schemes] = @visit_schemes unless @visit_schemes.empty?
            kwargs[:hosts]   = @visit_hosts   unless @visit_hosts.empty?
            kwargs[:ports]   = @visit_ports   unless @visit_ports.empty?
            kwargs[:links]   = @visit_links   unless @visit_links.empty?
            kwargs[:exts]    = @visit_exts    unless @visit_exts.empty?

            kwargs[:ignore_hosts] = @ignore_hosts unless @ignore_hosts.empty?
            kwargs[:ignore_ports] = @ignore_ports unless @ignore_ports.empty?
            kwargs[:ignore_links] = @ignore_links unless @ignore_links.empty?
            kwargs[:ignore_exts]  = @ignore_exts  unless @ignore_exts.empty?

            kwargs[:robots] = options[:robots] if options.has_key?(:robots)

            return kwargs
          end

          #
          # Prints the status of a page.
          #
          # @param [Spidr::Page] page
          #   A spidered page.
          #
          def print_status(page)
            if page.code < 300
              print "#{colors.bright_green(page.code)} "
            elsif page.code < 400
              print "#{colors.bright_yellow(page.code)} "
            elsif page.code < 500
              print "#{colors.bright_red(page.code)} "
            else
              print "#{colors.bold(colors.bright_red(page.code))} "
            end
          end

          #
          # Prints the URL for a page.
          #
          # @param [Spidr::Page] page
          #   A spidered page.
          #
          def print_url(page)
            if page.code < 300
              puts "#{colors.green(page.url)} "
            elsif page.code < 400
              puts "#{colors.yellow(page.url)} "
            elsif page.code < 500
              puts "#{colors.red(page.url)} "
            else
              puts "#{colors.bold(colors.red(page.url))} "
            end
          end

          #
          # Prints a page.
          #
          # @param [Spidr::Page] page
          #   A spidered page.
          #
          def print_page(page)
            print_status(page) if options[:print_status]
            print_url(page)

            if options[:print_headers]
              print_headers(page)
            elsif options[:print_header]
              if (header = page.response[options[:print_header]])
                print_content header
              end
            end

            print_query(page) if (options[:xpath] || options[:css_path])
          end

          #
          # Prints the headers of a page.
          #
          # @param [Spidr::Page] page
          #   A spidered page.
          #
          def print_headers(page)
            page.response.each_capitalized do |name,value|
              print_content "#{name}: #{value}"
            end
          end

          #
          # Prints the XPath or CSS-path query result for the page.
          #
          # @param [Spidr::Page] page
          #   A spidered page.
          #
          def print_query(page)
            if page.html?
              if options[:xpath]
                print_content page.doc.xpath(options[:xpath])
              elsif options[:css_path]
                print_content page.doc.css(options[:css_path])
              end
            end
          end

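          # NOTE: an illustrative `--xpath` run through `print_page` above
          # (hypothetical target and output):
          #
          #     $ ronin-web spider --site https://example.com/ --xpath '//title'
          #     https://example.com/
          #         <title>Example Domain</title>
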
          #
          # Prints an information message.
          #
          # @param [String] message
          #
          def print_verbose(message)
            if verbose?
              puts colors.yellow("* #{message}")
            end
          end

          #
          # Print content from a page.
          #
          # @param [#to_s] content
          #   The content to print.
          #
          def print_content(content)
            content.to_s.each_line do |line|
              puts "    #{line}"
            end
          end

        end
      end
    end
  end
end