lib/ronin/web/spider.rb in ronin-web-spider-0.1.0 vs lib/ronin/web/spider.rb in ronin-web-spider-0.1.1

- old
+ new

@@ -1,5 +1,6 @@ +# frozen_string_literal: true # # ronin-web-spider - A collection of common web spidering routines. # # Copyright (c) 2006-2023 Hal Brodigan (postmodern.mod3 at gmail.com) # @@ -28,281 +29,281 @@ # [spidr]: https://github.com/postmodern/spidr#readme # # ## Examples # # Spider a host: - # + # # ```ruby # require 'ronin/web/spider' - # + # # Ronin::Web::Spider.start_at('http://tenderlovemaking.com/') do |agent| # # ... # end # ``` - # + # # Spider a host: - # + # # ```ruby # Ronin::Web::Spider.host('solnic.eu') do |agent| # # ... # end # ``` - # + # # Spider a domain (and any sub-domains): - # + # # ```ruby # Ronin::Web::Spider.domain('ruby-lang.org') do |agent| # # ... # end # ``` - # + # # Spider a site: - # + # # ```ruby # Ronin::Web::Spider.site('http://www.rubyflow.com/') do |agent| # # ... # end # ``` - # + # # Spider multiple hosts: - # + # # ```ruby # Ronin::Web::Spider.start_at('http://company.com/', hosts: ['company.com', /host[\d]+\.company\.com/]) do |agent| # # ... # end # ``` - # + # # Do not spider certain links: - # + # # ```ruby # Ronin::Web::Spider.site('http://company.com/', ignore_links: [%{^/blog/}]) do |agent| # # ... # end # ``` - # + # # Do not spider links on certain ports: - # + # # ```ruby # Ronin::Web::Spider.site('http://company.com/', ignore_ports: [8000, 8010, 8080]) do |agent| # # ... # end # ``` - # + # # Do not spider links blacklisted in robots.txt: - # + # # ```ruby # Ronin::Web::Spider.site('http://company.com/', robots: true) do |agent| # # ... 
# end # ``` - # + # # Print out visited URLs: - # + # # ```ruby # Ronin::Web::Spider.site('http://www.rubyinside.com/') do |spider| # spider.every_url { |url| puts url } # end # ``` - # + # # Build a URL map of a site: - # + # # ```ruby # url_map = Hash.new { |hash,key| hash[key] = [] } - # + # # Ronin::Web::Spider.site('http://intranet.com/') do |spider| # spider.every_link do |origin,dest| # url_map[dest] << origin # end # end # ``` - # + # # Print out the URLs that could not be requested: - # + # # ```ruby # Ronin::Web::Spider.site('http://company.com/') do |spider| # spider.every_failed_url { |url| puts url } # end # ``` - # + # # Finds all pages which have broken links: - # + # # ```ruby # url_map = Hash.new { |hash,key| hash[key] = [] } - # + # # spider = Ronin::Web::Spider.site('http://intranet.com/') do |spider| # spider.every_link do |origin,dest| # url_map[dest] << origin # end # end - # + # # spider.failures.each do |url| # puts "Broken link #{url} found in:" - # + # # url_map[url].each { |page| puts " #{page}" } # end # ``` - # + # # Search HTML and XML pages: - # + # # ```ruby # Ronin::Web::Spider.site('http://company.com/') do |spider| # spider.every_page do |page| # puts ">>> #{page.url}" - # + # # page.search('//meta').each do |meta| # name = (meta.attributes['name'] || meta.attributes['http-equiv']) # value = meta.attributes['content'] - # + # # puts " #{name} = #{value}" # end # end # end # ``` - # + # # Print out the titles from every page: - # + # # ```ruby # Ronin::Web::Spider.site('https://www.ruby-lang.org/') do |spider| # spider.every_html_page do |page| # puts page.title # end # end # ``` - # + # # Print out every HTTP redirect: - # + # # ```ruby # Ronin::Web::Spider.host('company.com') do |spider| # spider.every_redirect_page do |page| # puts "#{page.url} -> #{page.headers['Location']}" # end # end # ``` - # + # # Find what kinds of web servers a host is using, by accessing the headers: - # + # # ```ruby # servers = Set[] - # + # # 
Ronin::Web::Spider.host('company.com') do |spider| # spider.all_headers do |headers| # servers << headers['server'] # end # end # ``` - # + # # Pause the spider on a forbidden page: - # + # # ```ruby # Ronin::Web::Spider.host('company.com') do |spider| # spider.every_forbidden_page do |page| # spider.pause! # end # end # ``` - # + # # Skip the processing of a page: - # + # # ```ruby # Ronin::Web::Spider.host('company.com') do |spider| # spider.every_missing_page do |page| # spider.skip_page! # end # end # ``` - # + # # Skip the processing of links: - # + # # ```ruby # Ronin::Web::Spider.host('company.com') do |spider| # spider.every_url do |url| # if url.path.split('/').find { |dir| dir.to_i > 1000 } # spider.skip_link! # end # end # end # ``` - # + # # Detect when a new host name is spidered: - # + # # ```ruby # Ronin::Web::Spider.domain('example.com') do |spider| # spider.every_host do |host| # puts "Spidering #{host} ..." # end # end # ``` - # + # # Detect when a new SSL/TLS certificate is encountered: - # + # # ```ruby # Ronin::Web::Spider.domain('example.com') do |spider| # spider.every_cert do |cert| # puts "Discovered new cert for #{cert.subject.common_name}, #{cert.subject_alt_name}" # end # end # ``` - # + # # Print the MD5 checksum of every `favicon.ico` file: - # + # # ```ruby # Ronin::Web::Spider.domain('example.com') do |spider| # spider.every_favicon do |page| # puts "#{page.url}: #{page.body.md5}" # end # end # ``` - # + # # Print every HTML comment: - # + # # ```ruby # Ronin::Web::Spider.domain('example.com') do |spider| # spider.every_html_comment do |comment| # puts comment # end # end # ``` - # + # # Print all JavaScript source code: - # + # # ```ruby # Ronin::Web::Spider.domain('example.com') do |spider| # spider.every_javascript do |js| # puts js # end # end # ``` - # + # # Print every JavaScript string literal: - # + # # ```ruby # Ronin::Web::Spider.domain('example.com') do |spider| # spider.every_javascript_string do |str| # puts str # end # 
end # ``` - # + # # Print every JavaScript comment: - # + # # ```ruby # Ronin::Web::Spider.domain('example.com') do |spider| # spider.every_javascript_comment do |comment| # puts comment # end # end # ``` - # + # # Print every HTML and JavaScript comment: - # + # # ```ruby # Ronin::Web::Spider.domain('example.com') do |spider| # spider.every_comment do |comment| # puts comment # end # end # ``` - # + # module Spider # # Creates a new agent and begin spidering at the given URL. # # @param [URI::HTTP, String] url