# frozen_string_literal: true # # ronin-web-spider - A collection of common web spidering routines. # # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com) # # ronin-web-spider is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published # by the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # ronin-web-spider is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with ronin-web-spider. If not, see <https://www.gnu.org/licenses/>. # require 'spidr/agent' require 'ronin/support/network/http' require 'ronin/support/crypto/cert' require 'ronin/support/text/patterns/source_code' require 'ronin/support/encoding/js' module Ronin module Web module Spider # # Extends [Spidr::Agent](https://rubydoc.info/gems/spidr/Agent). # class Agent < Spidr::Agent # # Creates a new Spider object. # # @param [Spidr::Proxy, Addressable::URI, URI::HTTP, Hash, String, nil] proxy # The proxy to use while spidering. # # @param [String, nil] user_agent # The User-Agent string to send. # # @param [Hash{Symbol => Object}] kwargs # Additional keyword arguments for `Spidr::Agent#initialize`. # # @option kwargs [String, nil] :referer # The referer URL to send. # # @option kwargs [Integer] :delay (0) # Duration in seconds to pause between spidering each link. # # @option kwargs [Array] :schemes (['http', 'https']) # The list of acceptable URI schemes to visit. # The `https` scheme will be ignored if `net/https` cannot be # loaded. # # @option kwargs [String, nil] :host # The host-name to visit. # # @option kwargs [Array] :hosts # The patterns which match the host-names to visit.
#
# @option kwargs [Array] :ignore_hosts
#   The patterns which match the host-names to not visit.
#
# @option kwargs [Array] :ports
#   The patterns which match the ports to visit.
#
# @option kwargs [Array] :ignore_ports
#   The patterns which match the ports to not visit.
#
# @option kwargs [Array] :links
#   The patterns which match the links to visit.
#
# @option kwargs [Array] :ignore_links
#   The patterns which match the links to not visit.
#
# @option kwargs [Array] :exts
#   The patterns which match the URI path extensions to visit.
#
# @option kwargs [Array] :ignore_exts
#   The patterns which match the URI path extensions to not visit.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created web spider
#   agent.
#
# @yieldparam [Agent] agent
#   The newly created web spider agent.
#
# @see https://rubydoc.info/gems/spidr/Spidr/Agent#initialize-instance_method
#
# @api public
#
def initialize(proxy:      Support::Network::HTTP.proxy,
               user_agent: Support::Network::HTTP.user_agent,
               **kwargs,
               &block)
  # An Addressable::URI proxy is repackaged as a Spidr::Proxy; every other
  # kind of proxy value is forwarded to the superclass untouched.
  case proxy
  when Addressable::URI
    proxy = Spidr::Proxy.new(
      host:     proxy.host,
      port:     proxy.port,
      user:     proxy.user,
      password: proxy.password
    )
  end

  # A Symbol is used as a key into the UserAgents table; anything else is
  # passed through as given.
  case user_agent
  when Symbol
    user_agent = Support::Network::HTTP::UserAgents[user_agent]
  end

  super(proxy: proxy, user_agent: user_agent, **kwargs, &block)
end

# The host names recorded by {#every_host}.
#
# @return [Set<String>, nil]
#
# @api public
attr_reader :visited_hosts

#
# Yields each host name the first time a page from that host is
# encountered, and records it in {#visited_hosts}.
#
# @yield [host]
#
# @yieldparam [String] host
#
# @example
#   spider.every_host do |host|
#     puts "Spidering #{host} ..."
#   end
#
# @api public
#
def every_host
  @visited_hosts ||= Set.new

  every_page do |page|
    host = page.url.host

    # Set#add? returns nil when the host has already been recorded.
    next unless @visited_hosts.add?(host)

    yield host
  end
end

# All certificates encountered while spidering.
#
# @return [Array<Ronin::Support::Crypto::Cert>]
#
# @api public
attr_reader :collected_certs

#
# Yields each TLS certificate the first time its serial number is seen on
# an `https` page, and appends it to {#collected_certs}.
#
# @yield [cert]
#
# @yieldparam [Ronin::Support::Crypto::Cert] cert
#
# @example
#   spider.every_cert do |cert|
#     puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}"
#   end
#
# @api public
#
def every_cert
  @collected_certs ||= []

  # Serial numbers already yielded by this callback.
  seen_serials = Set.new

  every_page do |page|
    next unless page.url.scheme == 'https'

    raw_cert = sessions[page.url].peer_cert
    next unless seen_serials.add?(raw_cert.serial)

    wrapped_cert = Support::Crypto::Cert(raw_cert)

    @collected_certs << wrapped_cert
    yield wrapped_cert
  end
end

#
# Yields every page that is a favicon.
#
# @yield [favicon]
#   The given block will be passed every encountered `.ico` file.
#
# @yieldparam [Spidr::Page] favicon
#   An encountered `.ico` file.
#
# @example
#   spider.every_favicon do |page|
#     # ...
#   end
#
# @see https://rubydoc.info/gems/spidr/Spidr/Page
#
# @api public
#
def every_favicon
  every_page do |page|
    next unless page.icon?

    yield page
  end
end

#
# Yields the text of every non-empty HTML comment.
#
# @yield [comment]
#   The given block will be passed every HTML comment.
#
# @yieldparam [String] comment
#   The HTML comment inner text, with leading and trailing whitespace
#   stripped.
#
# @example
#   spider.every_html_comment do |comment|
#     puts comment
#   end
#
# @api public
#
def every_html_comment
  every_html_page do |page|
    doc = page.doc

    # Pages without a parsed document are skipped.
    next unless doc

    doc.xpath('//comment()').each do |node|
      text = node.inner_text.strip

      yield text unless text.empty?
    end
  end
end

# NOTE(review): the source is cut off inside `every_javascript`; the
# fragment below is left exactly as found.
# # Passes every piece of JavaScript to the given block. # # @yield [js] # The given block will be passed every piece of JavaScript source. # # @yieldparam [String] js # The JavaScript source code. # # @example # spider.every_javascript do |js| # puts js # end # # @api public # def every_javascript # yield inner text of every `