# frozen_string_literal: true require 'nokogiri' require 'rexml/document' require 'find' require 'net/http' require_relative 'rdoc_link_checker/version' class RDocLinkChecker include REXML attr_accessor :html_dirpath, :onsite_only, :no_toc def initialize( html_dirpath, onsite_only: false, no_toc: false ) self.html_dirpath = html_dirpath self.onsite_only = onsite_only self.no_toc = no_toc @pages = {} @counts = { source_pages: 0, target_pages: 0, links_checked: 0, links_broken: 0, } @verbose = false end def check # All work is done in the HTML directory, # and that is where Report.htm will be put. Dir.chdir(html_dirpath) do |dir| @counts[:start_time] = Time.now gather_source_paths create_source_pages create_target_pages verify_links @counts[:end_time] = Time.now report end end # Gather paths to source HTML pages. def gather_source_paths paths = [] puts 'Gathering source paths' if @verbose paths = Find.find('.').select {|path| path.end_with?('.html') } # Remove leading './'. @source_paths = paths.map{|path| path.sub(%r[^\./], '')} @source_paths.delete('table_of_contents.html') if no_toc if @verbose @source_paths.each_with_index do |source_path, i| puts '- %4d %s' % [i, source_path] end end @counts[:source_pages] = @source_paths.size puts "Gathered #{@source_paths.size} source paths" if @verbose end # Create a source \Page object for each source path. # Gather its links and ids. 
def create_source_pages
  # Builds one Page per gathered source path, registers it in @pages,
  # and collects that page's links and ids from its parsed HTML.
  puts "Creating #{@source_paths.size} source pages" if @verbose
  @source_paths.sort.each_with_index do |source_path, i|
    progress_s = RDocLinkChecker.progress_s(i + 1, @source_paths.size)
    puts "Creating source page #{source_path} #{progress_s}" if @verbose
    source_page = Page.new(source_path, @verbose, @pages, @counts, onsite_only)
    @pages[source_path] = source_page
    doc = Nokogiri::HTML(File.read(source_path))
    source_page.gather_links(doc)
    source_page.gather_ids(doc)
    puts "Created source page #{progress_s}" if @verbose
  end
  puts "Created #{@pages.size} source pages" if @verbose
end

# Create a target \Page object for each link
# (unless already created as a source page).
#
# A readable local file is parsed and its ids gathered; any other
# checkable path is handed to #fetch, and the link is marked invalid
# when the fetch reports an exception.
# Stores the number of pages created in @counts[:target_pages].
def create_target_pages
  target_page_count = 0
  # Take a snapshot of the keys: @pages grows while we iterate.
  @source_paths = @pages.keys
  @source_paths.each do |source_path|
    # Needed for relative links to work.
    Dir.chdir(File.dirname(source_path)) do
      source_page = @pages[source_path]
      puts "Creating target pages for #{source_page.links.size} links in #{source_path}" if @verbose
      source_page.links.each_with_index do |link, i|
        next if link.path.nil?

        link.puts(i) if @verbose
        target_path = link.real_path
        if @pages[target_path]
          puts "Page #{target_path} already created" if @verbose
        elsif File.readable?(link.path)
          puts "Creating target page #{target_path}" if @verbose
          target_page_count += 1
          target_page = Page.new(target_path, @verbose, @pages, @counts, onsite_only)
          @pages[target_path] = target_page
          target_page.gather_ids(Nokogiri::HTML(File.read(link.path)))
          puts "Created target page #{target_path}" if @verbose
        elsif RDocLinkChecker.checkable?(link.path)
          puts "Creating target page #{target_path}" if @verbose
          target_page_count += 1
          target_page = Page.new(target_path, @verbose, @pages, @counts, onsite_only)
          @pages[target_path] = target_page
          puts "Created target page #{target_path}" if @verbose
          link.exception = fetch(link.path, target_page)
          link.valid_p = false if link.exception
        else
          puts "File not readable or checkable: #{target_path}" if @verbose
        end
        # Ids are gathered only from the document parsed for the page itself.
        # There is deliberately no second pass with a document kept from an
        # earlier link: that document may belong to a different page and
        # would make missing fragments look valid.
      end
      puts "Created target pages for #{source_page.links.size} links in #{source_path}" if @verbose
    end
  end
  puts "Created #{target_page_count} target pages" if @verbose
  @counts[:target_pages] = target_page_count
end

# Verify that each link target exists.
#
# A link is valid when its target page is known and either the link has
# no fragment or the fragment is one of the target page's ids.
# Links whose validity was already decided are left alone.
# Stores totals in @counts[:links_checked] and @counts[:links_broken].
def verify_links
  linking_pages = @pages.reject { |_path, page| page.links.empty? }
  puts "Checking links on #{linking_pages.size} pages" if @verbose
  link_count = 0
  broken_count = 0
  linking_pages.each_pair do |path, page|
    puts "Checking #{page.links.size} links on page #{path}" if @verbose
    link_count += page.links.size
    page.links.each_with_index do |link, i|
      # Don't disturb if already set to false.
      if link.valid_p.nil?
        target_page = @pages[link.real_path]
        if target_page
          target_id = link.fragment
          link.valid_p = target_id.nil? || target_page.ids.include?(target_id)
        else
          # Must assign through the link; a bare local would leave it nil.
          link.valid_p = false
        end
      end
      link.puts(i) if @verbose
      broken_count += 1 unless link.valid_p
    end
    puts "Checked #{page.links.size} links on page #{path}" if @verbose
  end
  puts "Checked #{link_count} links on #{linking_pages.size} pages" if @verbose
  @counts[:links_checked] = link_count
  @counts[:links_broken] = broken_count
end

# Fetch the page from the web and gather its ids into the target page.
# Returns exception or nil.
#
# The numeric response code is recorded on +target_page+.
# Errors whose class name begins with Net, SocketError, IO::TimeoutError
# or Errno:: are wrapped in an HttpResponseError and returned;
# any other error is re-raised.
def fetch(url, target_page)
  puts "Begin fetch target page #{url}" if @verbose
  puts "Getting return code for #{url}" if @verbose
  code = 0
  exception = nil
  response = nil
  begin
    response = Net::HTTP.get_response(URI(url))
    code = response.code.to_i
    target_page.code = code
    puts "Returned #{code} (#{response.class})" if @verbose
  rescue => x
    puts "Raised #{x.class} #{x.message}" if @verbose
    # to_s: an anonymous exception class has a nil name.
    raise unless x.class.name.to_s.match?(/^(Net|SocketError|IO::TimeoutError|Errno::)/)
    exception = RDocLinkChecker::HttpResponseError.new(url, x)
  end
  puts "Got return code #{code} for #{url} " if @verbose
  # Don't load if bad code, or no response, or if not html.
  if !code_bad?(code) && content_type_html?(response)
    target_page.gather_ids(Nokogiri::HTML(response.body))
  end
  puts "End fetch target page #{url}" if @verbose
  exception
end

# Returns whether the code is bad (zero or >= 400).
# A +nil+ code is not considered bad.
def code_bad?(code)
  return false if code.nil?

  code.zero? || code >= 400
end

# Returns whether the response body should be HTML,
# judged by its Content-Type header; +false+ when there is
# no response or no such header.
def content_type_html?(response)
  return false unless response

  content_type = response['Content-Type']
  return false unless content_type

  content_type.match?('html')
end

# Returns whether the path is offsite
# (that is, whether it begins with 'http').
def self.offsite?(path)
  path.start_with?('http')
end

# Returns the string fragment for the given path or URL, or +nil+
# when there is no '#'. Only the first '#' separates the fragment.
def self.get_fragment(s)
  _path, fragment = s.split('#', 2)
  fragment
end

# Returns a progress string giving a fraction and percentage.
def self.progress_s(i, total) fraction_s = "#{i}/#{total}" percent_i = (i*100.0/total).round "(#{fraction_s}, #{percent_i}%)" end # Returns whether the path is checkable. def self.checkable?(path) return false unless path begin uri = URI(path) return ['http', 'https', nil].include?(uri.scheme) rescue return false end end # Generate the report; +checker+ is the \RDocLinkChecker object. def report doc = Document.new('') root = doc.add_element(Element.new('root')) head = root.add_element(Element.new('head')) title = head.add_element(Element.new('title')) title.text = 'RDocLinkChecker Report' style = head.add_element(Element.new('style')) style.text = < :label, value => :good} data.push(row) end table2(body, data, 'Parameters') body.add_element(Element.new('p')) # Times table. elapsed_time = @counts[:end_time] - @counts[:start_time] seconds = elapsed_time % 60 minutes = (elapsed_time / 60) % 60 hours = (elapsed_time/3600) elapsed_time_s = "%2.2d:%2.2d:%2.2d" % [hours, minutes, seconds] format = "%Y-%m-%d-%a-%H:%M:%S" start_time_s = @counts[:start_time].strftime(format) end_time_s = @counts[:end_time].strftime(format) data = [ {'Start Time' => :label, start_time_s => :good}, {'End Time' => :label, end_time_s => :good}, {'Elapsed Time' => :label, elapsed_time_s => :good}, ] table2(body, data, 'Times') body.add_element(Element.new('p')) # Counts. data = [ {'Source Pages' => :label, @counts[:source_pages] => :good}, {'Target Pages' => :label, @counts[:target_pages] => :good}, {'Links Checked' => :label, @counts[:links_checked] => :good}, {'Links Broken' => :label, @counts[:links_broken] => :bad}, ] table2(body, data, 'Counts') body.add_element(Element.new('p')) end def add_broken_links(body) h2 = body.add_element(Element.new('h2')) h2.text = 'Broken Links by Source Page' if @counts[:links_broken] == 0 p = body.add_element('p') p.text = 'None.' 
return end ul = body.add_element(Element.new('ul')) li = ul.add_element(Element.new('li')) li.text = 'Href: the href of the anchor element.' li = ul.add_element(Element.new('li')) li.text = 'Text: the text of the anchor element.' li = ul.add_element(Element.new('li')) li.text = 'Path: the URL or path of the link (not including the fragment):' ul2 = li.add_element(Element.new('ul')) li2 = ul2.add_element(Element.new('li')) li2.text = 'For an on-site link, an abbreviated path is given.' li2 = ul2.add_element(Element.new('li')) li2.text = < :label, a => :bad}) data.push({'Text' => :label, link.text => :good}) fragment_p = !link.fragment.nil? class_ = fragment_p ? :good : :bad data.push({'Path' => :label, link.real_path => class_}) class_ = fragment_p ? :bad : :good data.push({'Fragment' => :label, link.fragment => class_}) if link.exception data.push({'Exception' => :label, link.exception.class => :bad}) data.push({'Message' => :label, link.exception.message => :bad}) end table2(body, data) body.add_element(Element.new('p')) end end end def add_offsite_links(body) h2 = body.add_element(Element.new('h2')) h2.text = 'Off-Site Links by Source Page' @pages.each_pair do |path, page| offsite_links = page.links.select do |link| RDocLinkChecker.offsite?(link.href) end next if offsite_links.empty? h3 = body.add_element(Element.new('h3')) a = Element.new('a') a.text = path a.add_attribute('href', path) h3.add_element(a) offsite_links.each do |link| data = [] # Text, URL, fragment a = Element.new('a') a.text = link.href a.add_attribute('href', link.href) class_ = link.valid_p ? 
:good : :bad data.push({'Href' => :label, a => class_}) data.push({'Text' => :label, link.text => :good}) table2(body, data) body.add_element(Element.new('p')) end end end Classes = { label: 'label center neutral', good: 'data center good', iffy: 'data center iffy', bad: 'data center bad', } def table2(parent, data, title = nil) data = data.dup table = parent.add_element(Element.new('table')) if title tr = table.add_element(Element.new('tr)')) th = tr.add_element(Element.new('th')) th.add_attribute('colspan', 2) if title.kind_of?(REXML::Element) th.add_element(title) else th.text = title end end data.each do |row_h| label, label_class, value, value_class = row_h.flatten tr = table.add_element(Element.new('tr')) td = tr.add_element(Element.new('td')) td.text = label td.add_attribute('class', Classes[label_class]) td = tr.add_element(Element.new('td')) if value.kind_of?(REXML::Element) td.add_element(value) else td.text = value end td.add_attribute('class', Classes[value_class]) end end class Error; end class HttpResponseError < Error attr_accessor :url, :x def initialize(url, x) self.url = url self.x = x end def message <