# frozen_string_literal: true

module BrokenLinkFinder
  DEFAULT_MAX_THREADS = 100
  SERVER_WAIT_TIME    = 0.5

  # Alias for BrokenLinkFinder::Finder.new.
  def self.new(sort: :page, max_threads: DEFAULT_MAX_THREADS)
    Finder.new(sort: sort, max_threads: max_threads)
  end

  class Finder
    attr_reader :sort, :max_threads, :broken_links, :ignored_links, :crawl_stats

    # Creates a new Finder instance.
    def initialize(sort: :page, max_threads: BrokenLinkFinder::DEFAULT_MAX_THREADS)
      raise "Sort by either :page or :link, not #{sort}" \
        unless %i[page link].include?(sort)

      @sort        = sort
      @max_threads = max_threads
      @lock        = Mutex.new
      @crawler     = Wgit::Crawler.new

      reset_crawl
    end

    # Clear/empty the link collection objects.
    def reset_crawl
      @broken_links      = {}      # Used for mapping pages to broken links.
      @ignored_links     = {}      # Used for mapping pages to ignored links.
      @all_broken_links  = Set.new # Used to prevent crawling a broken link twice.
      @all_intact_links  = Set.new # Used to prevent crawling an intact link twice.
      @all_ignored_links = Set.new # Used for building crawl statistics.
      @broken_link_map   = {}      # Maps a link to its absolute (crawlable) form.
      @crawl_stats       = {}      # Records crawl stats e.g. duration etc.
    end

    # Finds broken links within a single page and records them.
    # Returns true if at least one broken link was found.
    # Access the broken links afterwards with Finder#broken_links.
    def crawl_url(url)
      reset_crawl

      start = Time.now
      url   = url.to_url

      # We dup the url to avoid recording any redirects.
      doc = @crawler.crawl(url.dup)

      # Ensure the given page url is valid.
      raise "Invalid or broken URL: #{url}" unless doc

      # Get all page links and determine which are broken.
      find_broken_links(doc)
      retry_broken_links

      sort_links
      set_crawl_stats(url: url, pages_crawled: [url], start: start)

      @broken_links.any?
    end

    # Finds broken links within an entire site and records them.
    # Returns true if at least one broken link was found.
    # Access the broken links afterwards with Finder#broken_links.
    def crawl_site(url)
      reset_crawl

      start   = Time.now
      url     = url.to_url
      pool    = Thread.pool(@max_threads)
      crawled = Set.new

      # Crawl the site's HTML web pages looking for links.
      # We dup the url to avoid recording any redirects.
      externals = @crawler.crawl_site(url.dup) do |doc|
        crawled << doc.url
        next unless doc

        # Start a thread for each page, checking for broken links.
        pool.process { find_broken_links(doc) }
      end

      # Ensure the given website url is valid.
      raise "Invalid or broken URL: #{url}" unless externals

      # Wait for all threads to finish.
      pool.shutdown

      retry_broken_links

      sort_links
      set_crawl_stats(url: url, pages_crawled: crawled.to_a, start: start)

      @broken_links.any?
    end

    # Outputs the link report into a stream e.g. STDOUT or a file,
    # anything that respond_to? :puts. Defaults to STDOUT.
    def report(stream = STDOUT, type: :text,
               broken_verbose: true, ignored_verbose: false)
      klass = case type
              when :text
                BrokenLinkFinder::TextReporter
              when :html
                BrokenLinkFinder::HTMLReporter
              else
                raise "The type: must be :text or :html, not: :#{type}"
              end

      reporter = klass.new(stream, @sort, @broken_links, @ignored_links,
                           @broken_link_map, @crawl_stats)
      reporter.call(broken_verbose: broken_verbose,
                    ignored_verbose: ignored_verbose)
    end
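    # For illustration only (a sketch, not part of the class API): the report
    # stream can be anything responding to :puts, e.g. a File opened for write:
    #
    #   finder = BrokenLinkFinder.new
    #   finder.crawl_url('http://example.com')
    #   File.open('report.html', 'w') do |file|
    #     finder.report(file, type: :html, ignored_verbose: true)
    #   end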
    private

    # Finds which links are unsupported or broken and records the details.
    def find_broken_links(page)
      process_unparsable_links(page) # Record them as broken.

      links = get_supported_links(page)

      # Iterate over the supported links checking if they're broken or not.
      links.each do |link|
        # Skip if the link has been encountered previously.
        next if @all_intact_links.include?(link)

        if @all_broken_links.include?(link)
          # The link has already been proven broken so simply record it.
          append_broken_link(page, link, map: false)
          next
        end

        # The link hasn't been encountered before so we crawl it.
        link_doc = crawl_link(page, link)

        # Determine if the crawled link is broken or not and record it.
        if link_broken?(link_doc)
          append_broken_link(page, link)
        else
          # Record it as being intact.
          @lock.synchronize { @all_intact_links << link }
        end
      end

      nil
    end

    # Record each unparsable link as a broken link.
    def process_unparsable_links(doc)
      doc.unparsable_links.each do |link|
        append_broken_link(doc, link, map: false)
        @broken_link_map[link] = link
      end
    end

    # Implements a retry mechanism for each of the broken links found.
    # Removes any broken links found to be working OK.
    def retry_broken_links
      sleep(SERVER_WAIT_TIME) # Give the servers a break, then retry the links.

      @broken_link_map.select! do |link, href|
        # Don't retry unparsable links (which are Strings).
        next(true) unless href.is_a?(Wgit::Url)

        doc = @crawler.crawl(href.dup)

        if link_broken?(doc)
          true
        else
          remove_broken_link(link)
          false
        end
      end
    end

    # Report and reject any unsupported links. Any link that is absolute and
    # doesn't start with 'http' is unsupported e.g. 'mailto:blah' etc.
    def get_supported_links(doc)
      doc.all_links
         .reject do |link|
           if link.is_absolute? && !link.start_with?('http')
             append_ignored_link(doc.url, link)
             true
           end
         end
    end

    # Make the link absolute and crawl it, returning its Wgit::Document.
    def crawl_link(doc, link)
      link = link.prefix_base(doc)
      @crawler.crawl(link.dup) # We dup link to avoid recording any redirects.
    end

    # Returns true if the crawled link is broken, false otherwise.
    def link_broken?(doc)
      doc.nil? || @crawler.last_response.not_found? || has_broken_anchor(doc)
    end

    # Returns true if the link is/contains a broken anchor/fragment.
    # E.g. /about#top should contain a HTML element with an @id of 'top' etc.
    def has_broken_anchor(doc)
      raise 'The link document is nil' unless doc

      fragment = doc.url.fragment
      return false if fragment.nil? || fragment.empty?

      doc.xpath("//*[@id='#{fragment}']").empty?
    end

    # Append key => [value] to the broken link collections.
    # If map: true, then the link will also be recorded in @broken_link_map.
    def append_broken_link(doc, link, map: true)
      key, value = get_key_value(doc.url, link)

      @lock.synchronize do
        @broken_links[key] = [] unless @broken_links[key]
        @broken_links[key] << value

        @all_broken_links << link

        @broken_link_map[link] = link.prefix_base(doc) if map
      end
    end

    # Remove the broken link from the necessary collections.
    def remove_broken_link(link)
      @lock.synchronize do
        if @sort == :page
          @broken_links.each { |_k, links| links.delete(link) }
          @broken_links.delete_if { |_k, links| links.empty? }
        else
          @broken_links.delete(link)
        end

        @all_broken_links.delete(link)
        @all_intact_links << link
      end
    end

    # Append key => [value] to the ignored link collections.
    def append_ignored_link(url, link)
      key, value = get_key_value(url, link)

      @lock.synchronize do
        @ignored_links[key] = [] unless @ignored_links[key]
        @ignored_links[key] << value

        @all_ignored_links << link
      end
    end

    # Returns the correct key value depending on the @sort type.
    # @sort == :page ? [url, link] : [link, url]
    def get_key_value(url, link)
      case @sort
      when :page
        [url, link]
      when :link
        [link, url]
      else
        raise "Unsupported sort type: #{sort}"
      end
    end
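    # Illustrative only: given the key/value pairing above, the link
    # collections take roughly the following shapes (example URLs are made up):
    #
    #   # @sort == :page - pages map to their broken/ignored links:
    #   { 'http://example.com/about' => ['/no-such-page', 'mailto:blah'] }
    #
    #   # @sort == :link - links map to the pages they appear on:
    #   { '/no-such-page' => ['http://example.com/about'] }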
    # Sort keys and values alphabetically.
    def sort_links
      @broken_links.values.map(&:uniq!)
      @ignored_links.values.map(&:uniq!)

      @broken_links  = @broken_links.sort_by  { |k, _v| k }.to_h
      @ignored_links = @ignored_links.sort_by { |k, _v| k }.to_h

      @broken_links.each  { |_k, v| v.sort! }
      @ignored_links.each { |_k, v| v.sort! }
    end

    # Sets various statistics about the crawl and its links.
    def set_crawl_stats(url:, pages_crawled:, start:)
      @crawl_stats[:url]               = url
      @crawl_stats[:pages_crawled]     = pages_crawled
      @crawl_stats[:num_pages]         = pages_crawled.size
      @crawl_stats[:num_links]         = (
        @all_broken_links.size + @all_intact_links.size + @all_ignored_links.size
      )
      @crawl_stats[:num_broken_links]  = @all_broken_links.size
      @crawl_stats[:num_intact_links]  = @all_intact_links.size
      @crawl_stats[:num_ignored_links] = @all_ignored_links.size
      @crawl_stats[:duration]          = Time.now - start
    end

    alias crawl_page crawl_url
    alias crawl_r    crawl_site
  end
end
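
# A minimal usage sketch, assuming this file is loaded as part of the
# broken_link_finder gem (with Wgit, Thread::Pool and the reporter classes
# required elsewhere); the example URL below is hypothetical:
#
#   finder = BrokenLinkFinder.new(sort: :link, max_threads: 50)
#   finder.crawl_site('http://example.com') # => true if any broken links found
#   finder.report(STDOUT, type: :text)
#   finder.crawl_stats[:duration]           # => crawl time in seconds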