# frozen_string_literal: true

require_relative 'reporter'
require 'thread/pool'
require 'set'

module BrokenLinkFinder
  DEFAULT_MAX_THREADS = 100

  # Alias for BrokenLinkFinder::Finder.new.
  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
    Finder.new(sort: sort, max_threads: max_threads)
  end

  class Finder
    attr_reader :sort, :broken_links, :ignored_links, :total_links_crawled,
                :max_threads

    # Creates a new Finder instance.
    def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
      raise "Sort by either :page or :link, not #{sort}" \
        unless %i[page link].include?(sort)

      @sort        = sort
      @max_threads = max_threads
      @lock        = Mutex.new
      @crawler     = Wgit::Crawler.new

      clear_links
    end

    # Clear/empty the link collection Hashes.
    def clear_links
      @broken_links        = {}
      @ignored_links       = {}
      @total_links_crawled = 0
      @all_broken_links    = Set.new
      @all_intact_links    = Set.new
    end

    # Finds broken links within a single page and appends them to the
    # @broken_links Hash. Returns true if at least one broken link was found.
    # Access the broken links with Finder#broken_links.
    def crawl_url(url)
      clear_links

      url = url.to_url
      doc = @crawler.crawl_url(url)

      # Ensure the given page url is valid.
      raise "Invalid or broken URL: #{url}" unless doc

      # Get all page links and determine which are broken.
      find_broken_links(doc)

      sort_links
      set_total_links_crawled

      @broken_links.any?
    end

    # Finds broken links within an entire site and appends them to the
    # @broken_links Hash. Returns a two-element Array containing a Boolean
    # (true if at least one broken link was found) and an Array of all pages
    # crawled. Access the broken links with Finder#broken_links.
    def crawl_site(url)
      clear_links

      url  = url.to_url
      pool = Thread.pool(@max_threads)
      crawled_pages = []

      # Crawl the site's HTML web pages looking for links.
      orig_doc = @crawler.crawl_site(url) do |doc|
        crawled_pages << doc.url
        next unless doc

        # Start a thread for each page, checking for broken links.
        pool.process { find_broken_links(doc) }
      end

      # Ensure the given website url is valid.
      raise "Invalid or broken URL: #{url}" if orig_doc.nil?

      # Wait for all threads to finish.
      pool.shutdown

      sort_links
      set_total_links_crawled

      [@broken_links.any?, crawled_pages.uniq]
    end

    # Pretty prints the link report into a stream e.g. STDOUT or a file,
    # anything that respond_to? :puts. Defaults to STDOUT.
    # Returns true if broken links were found, false otherwise.
    def pretty_print_link_report(
      stream = STDOUT,
      broken_verbose: true,
      ignored_verbose: false
    )
      reporter = BrokenLinkFinder::Reporter.new(
        stream, @sort, @broken_links, @ignored_links
      )
      reporter.pretty_print_link_report(
        broken_verbose: broken_verbose,
        ignored_verbose: ignored_verbose
      )

      @broken_links.any?
    end

    private

    # Finds which links are unsupported or broken and records the details.
    def find_broken_links(doc)
      # Record (as ignored) and reject any unsupported links.
      links = doc.all_links
                 .reject do |link|
                   if link.is_absolute? && !link.start_with?('http')
                     append_ignored_link(doc.url, link)
                     true
                   end
                 end
                 .uniq

      # Iterate over the supported links checking if they're broken or not.
      links.each do |link|
        # Check if the link has already been processed.
        next if @all_intact_links.include?(link)

        if @all_broken_links.include?(link)
          append_broken_link(doc.url, link)
          next
        end

        # The link hasn't been processed before so we crawl it.
        link_url = get_absolute_link(doc, link)
        link_doc = @crawler.crawl_url(link_url)

        # Determine if the crawled link is broken or not.
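        # A link counts as broken when the response is a 404 (Not Found), when
        # no document could be crawled from it, or when its URL anchor
        # (#fragment) has no matching element id on the crawled page.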
        if @crawler.last_response.is_a?(Net::HTTPNotFound) ||
           link_doc.nil? ||
           has_broken_anchor(link_doc)
          append_broken_link(doc.url, link)
        else
          @lock.synchronize { @all_intact_links << link }
        end
      end

      nil
    end

    # Returns the link in absolute form so it can be crawled.
    def get_absolute_link(doc, link)
      link.is_relative? ? doc.base_url(link: link).concat(link) : link
    end

    # Returns true if the link is/contains a broken anchor.
    def has_broken_anchor(doc)
      raise 'link document is nil' unless doc

      anchor = doc.url.anchor
      return false if anchor.nil? || (anchor == '#')

      anchor = anchor[1..-1] if anchor.start_with?('#')
      doc.xpath("//*[@id='#{anchor}']").empty?
    end

    # Append key => [value] to @broken_links.
    def append_broken_link(url, link)
      key, value = get_key_value(url, link)

      @lock.synchronize do
        @broken_links[key] = [] unless @broken_links[key]
        @broken_links[key] << value
        @all_broken_links << link
      end
    end

    # Append key => [value] to @ignored_links.
    def append_ignored_link(url, link)
      key, value = get_key_value(url, link)

      @lock.synchronize do
        @ignored_links[key] = [] unless @ignored_links[key]
        @ignored_links[key] << value
      end
    end

    # Returns the correct key and value depending on the @sort type.
    # @sort == :page ? [url, link] : [link, url]
    def get_key_value(url, link)
      case @sort
      when :page
        [url, link]
      when :link
        [link, url]
      else
        raise "Unsupported sort type: #{sort}"
      end
    end

    # Sort keys and values alphabetically.
    def sort_links
      @broken_links.values.map(&:uniq!)
      @ignored_links.values.map(&:uniq!)

      @broken_links  = @broken_links.sort_by  { |k, _v| k }.to_h
      @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h

      @broken_links.each  { |_k, v| v.sort! }
      @ignored_links.each { |_k, v| v.sort! }
    end

    # Sets and returns the total number of links crawled.
    def set_total_links_crawled
      @total_links_crawled = @all_broken_links.size + @all_intact_links.size
    end

    alias crawl_page crawl_url
    alias crawl_r crawl_site
    alias pretty_print_link_summary pretty_print_link_report
  end
end
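
# A minimal usage sketch (not part of the library itself), included here
# because this file defines the public Finder API. It assumes the wgit gem is
# installed (Wgit::Crawler and String#to_url above come from it) and that the
# example URL is replaced with a real, reachable site.
if __FILE__ == $PROGRAM_NAME
  require 'wgit'

  finder = BrokenLinkFinder.new(sort: :page, max_threads: 30)

  # Crawl an entire site; use finder.crawl_page(url) for a single page instead.
  broken_found, crawled_pages = finder.crawl_site('http://example.com')

  # Print the report to STDOUT, then inspect the raw results.
  finder.pretty_print_link_report(STDOUT)
  puts "Broken links found?  #{broken_found}"
  puts "Pages crawled:       #{crawled_pages.size}"
  puts "Total links crawled: #{finder.total_links_crawled}"
end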