# frozen_string_literal: true

require 'nokogiri'
require 'rexml/document'
require 'find'
require 'net/http'

require_relative 'rdoc_link_checker/version'

# Checks the intra- and inter-page links of the HTML pages in a directory
# (typically RDoc output) and writes a Report.htm summarizing broken links.
class RDocLinkChecker

  include REXML

  # Configuration given to ::new.
  attr_accessor :html_dirpath, :onsite_only, :no_toc

  # Computed state: the discovered source paths and the path => Page map.
  attr_accessor :source_paths, :pages

  # - +html_dirpath+: directory containing the HTML pages to check.
  # - +onsite_only+: when true, off-site links are not fetched/verified.
  # - +no_toc+: when true, links are not gathered from source pages
  #   (only their ids are collected).
  def initialize(html_dirpath, onsite_only: false, no_toc: false)
    self.html_dirpath = html_dirpath
    self.onsite_only = onsite_only
    self.no_toc = no_toc
    self.pages = {}
    @counts = {
      source_pages: 0,
      target_pages: 0,
      links_checked: 0,
      links_broken: 0,
    }
  end

  # Runs the full check and writes the report.
  def check
    # All work is done in the HTML directory,
    # and that is where Report.htm will be put.
    Dir.chdir(html_dirpath) do |dir|
      @counts[:start_time] = Time.new
      gather_source_paths
      create_source_pages
      create_target_pages
      verify_links
      @counts[:end_time] = Time.new
      report
    end
  end

  # Gather paths to source HTML pages (relative, without the leading './').
  def gather_source_paths
    paths = Find.find('.').select { |path| path.end_with?('.html') }
    # Remove leading './'.
    self.source_paths = paths.map { |path| path.sub(%r[^\./], '') }
    @counts[:source_pages] = source_paths.size
  end

  # Create a source \Page object for each source path.
  # Gather its links and ids.
  def create_source_pages
    source_paths.sort.each do |source_path|
      source_page = Page.new(:source, source_path, onsite_only,
                             pages: pages, counts: @counts)
      pages[source_path] = source_page
      source_text = File.read(source_path)
      doc = Nokogiri::HTML(source_text)
      source_page.gather_links(doc) unless no_toc
      source_page.gather_ids(doc)
    end
  end

  # Create a target \Page object for each link
  # (unless already created as a source page).
  def create_target_pages
    doc = nil
    target_page_count = 0
    source_paths = pages.keys
    source_paths.each do |source_path|
      # Need for relative links to work.
      dirname = File.dirname(source_path)
      Dir.chdir(dirname) do
        source_page = pages[source_path]
        source_page.links.each do |link|
          next if link.path.nil?

          target_path = link.real_path
          if pages[target_path]
            target_page = pages[target_path]
          else
            target_page_count += 1
            target_page = Page.new(:target, target_path, onsite_only,
                                   pages: pages, counts: @counts)
            pages[target_path] = target_page
            if File.readable?(link.path)
              target_text = File.read(link.path)
              doc = Nokogiri::HTML(target_text)
              target_page.gather_ids(doc)
            elsif RDocLinkChecker.checkable?(link.path)
              link.exception = fetch(link.path, target_page)
              link.valid_p = false if link.exception
            else
              # File not readable or checkable.
            end
          end
          next if target_page.nil?

          if link.has_fragment? && target_page.ids.empty?
            # NOTE(review): +doc+ persists across iterations, so this may
            # reuse a parse of a previously visited page when +target_text+
            # is nil — TODO confirm this is the intended fallback.
            doc ||= Nokogiri::HTML(target_text)
            target_page.gather_ids(doc)
          end
        end
      end
    end
    @counts[:target_pages] = target_page_count
  end

  # Verify that each link target exists; sets link.valid_p and the counts.
  def verify_links
    linking_pages = pages.select do |path, page|
      !page.links.empty?
    end
    link_count = 0
    broken_count = 0
    linking_pages.each_pair do |path, page|
      link_count += page.links.size
      page.links.each do |link|
        if link.valid_p.nil? # Don't disturb if already set to false.
          target_page = pages[link.real_path]
          if target_page
            target_id = link.fragment
            link.valid_p = target_id.nil? || target_page.ids.include?(target_id)
          else
            link.valid_p = false
          end
        end
        broken_count += 1 unless link.valid_p
      end
    end
    @counts[:links_checked] = link_count
    @counts[:links_broken] = broken_count
  end

  # Fetch the page from the web and gather its ids into the target page.
  # Returns exception or nil.
  def fetch(url, target_page)
    code = 0
    exception = nil
    begin
      response = Net::HTTP.get_response(URI(url))
      code = response.code.to_i
      target_page.code = code
    rescue => x
      # Re-raise anything that is not a network-ish error.
      raise unless x.class.name.match(/^(Net|SocketError|IO::TimeoutError|Errno::)/)
      exception = RDocLinkChecker::HttpResponseError.new(url, x)
    end
    # Don't load if bad code, or no response, or if not html.
    if !code_bad?(code)
      if content_type_html?(response)
        doc = Nokogiri::HTML(response.body)
        target_page.gather_ids(doc)
      end
    end
    exception
  end

  # Returns whether the code is bad (zero or >= 400).
  def code_bad?(code)
    return false if code.nil?
    (code == 0) || (code >= 400)
  end

  # Returns whether the response body should be HTML.
  def content_type_html?(response)
    return false unless response
    return false unless response['Content-Type']
    response['Content-Type'].match('html')
  end

  # Returns whether the path is offsite.
  def self.offsite?(path)
    path.start_with?('http')
  end

  # Returns the string fragment for the given path or URL, or +nil+.
  def self.get_fragment(s)
    a = s.split('#', 2)
    a.size == 2 ? a[1] : nil
  end

  # Returns a progress string giving a fraction and percentage.
  def self.progress_s(i, total)
    fraction_s = "#{i}/#{total}"
    percent_i = (i * 100.0 / total).round
    "(#{fraction_s}, #{percent_i}%)"
  end

  # Returns whether the path is checkable (http, https, or schemeless).
  def self.checkable?(path)
    return false unless path
    begin
      uri = URI(path)
      return ['http', 'https', nil].include?(uri.scheme)
    rescue
      return false
    end
  end

  # Generate the report document and write it to Report.htm.
  def report
    doc = Document.new('')
    root = doc.add_element(Element.new('root'))
    head = root.add_element(Element.new('head'))
    title = head.add_element(Element.new('title'))
    title.text = 'RDocLinkChecker Report'
    style = head.add_element(Element.new('style'))
    # NOTE(review): the original CSS heredoc body was lost when this file
    # was mangled; this is a minimal reconstruction matching the classes
    # used in +Classes+ below — TODO restore the upstream stylesheet.
    style.text = <<~CSS
      .label   { font-weight: bold; }
      .center  { text-align: center; }
      .neutral { background-color: lightgray; }
      .good    { background-color: lightgreen; }
      .iffy    { background-color: yellow; }
      .bad     { background-color: pink; }
      .data    { font-family: monospace; }
    CSS
    body = root.add_element(Element.new('body'))
    h1 = body.add_element(Element.new('h1'))
    h1.text = 'RDocLinkChecker Report'
    add_summary(body)
    add_broken_links(body)
    add_offsite_links(body)
    File.write('Report.htm', doc.to_s)
  end

  # Adds the summary tables (parameters, times, counts) to the report body.
  def add_summary(body)
    h2 = body.add_element(Element.new('h2'))
    h2.text = 'Summary'
    # Parameters table.
    # NOTE(review): the head of this method was lost in the mangled copy;
    # the parameter rows are reconstructed from the visible row shape
    # ({label => :label, value => :good}) — TODO confirm against upstream.
    data = []
    {
      'HTML Directory' => html_dirpath,
      'Onsite Only' => onsite_only,
      'No TOC' => no_toc,
    }.each_pair do |label, value|
      row = {label => :label, value => :good}
      data.push(row)
    end
    table2(body, data, 'parameters', 'Parameters')
    body.add_element(Element.new('p'))
    # Times table.
    elapsed_time = @counts[:end_time] - @counts[:start_time]
    seconds = elapsed_time % 60
    minutes = (elapsed_time / 60) % 60
    hours = (elapsed_time / 3600)
    elapsed_time_s = "%2.2d:%2.2d:%2.2d" % [hours, minutes, seconds]
    format = "%Y-%m-%d-%a-%H:%M:%SZ"
    start_time_s = @counts[:start_time].strftime(format)
    end_time_s = @counts[:end_time].strftime(format)
    data = [
      {'Start Time' => :label, start_time_s => :good},
      {'End Time' => :label, end_time_s => :good},
      {'Elapsed Time' => :label, elapsed_time_s => :good},
    ]
    table2(body, data, 'times', 'Times')
    body.add_element(Element.new('p'))
    # Counts.
    data = [
      {'Source Pages' => :label, @counts[:source_pages] => :good},
      {'Target Pages' => :label, @counts[:target_pages] => :good},
      {'Links Checked' => :label, @counts[:links_checked] => :good},
      {'Links Broken' => :label, @counts[:links_broken] => :bad},
    ]
    table2(body, data, 'counts', 'Counts')
    body.add_element(Element.new('p'))
  end

  # Adds one table per broken link, grouped by source page.
  def add_broken_links(body)
    h2 = body.add_element(Element.new('h2'))
    h2.text = 'Broken Links by Source Page'
    if @counts[:links_broken] == 0
      p = body.add_element('p')
      p.text = 'None.'
      return
    end
    # Legend.
    ul = body.add_element(Element.new('ul'))
    li = ul.add_element(Element.new('li'))
    li.text = 'Href: the href of the anchor element.'
    li = ul.add_element(Element.new('li'))
    li.text = 'Text: the text of the anchor element.'
    li = ul.add_element(Element.new('li'))
    li.text = 'Path: the URL or path of the link (not including the fragment):'
    ul2 = li.add_element(Element.new('ul'))
    li2 = ul2.add_element(Element.new('li'))
    li2.text = 'For an on-site link, an abbreviated path is given.'
    li2 = ul2.add_element(Element.new('li'))
    # NOTE(review): the original heredoc body for this legend item was
    # lost in the mangled copy — TODO restore the upstream wording.
    li2.text = 'For an off-site link, the full URL is given.'
    # NOTE(review): the per-page iteration below was partially lost in the
    # mangled copy and is reconstructed from the visible row-building code
    # — TODO confirm against upstream.
    pages.each_pair do |path, page|
      broken_links = page.links.reject { |link| link.valid_p }
      next if broken_links.empty?

      page_div = body.add_element(Element.new('div'))
      h3 = page_div.add_element(Element.new('h3'))
      a = Element.new('a')
      a.text = path
      a.add_attribute('href', path)
      h3.add_element(a)
      broken_links.each do |link|
        link_div = page_div.add_element(Element.new('div'))
        data = []
        a = Element.new('a')
        a.text = link.href
        a.add_attribute('href', link.href)
        data.push({'Href' => :label, a => :bad})
        data.push({'Text' => :label, link.text => :good})
        # With a fragment present, the fragment (not the path) is the
        # broken part, and vice versa.
        fragment_p = !link.fragment.nil?
        class_ = fragment_p ? :good : :bad
        data.push({'Path' => :label, link.real_path => class_})
        class_ = fragment_p ? :bad : :good
        data.push({'Fragment' => :label, link.fragment => class_})
        if link.exception
          data.push({'Exception' => :label, link.exception.class => :bad})
          data.push({'Message' => :label, link.exception.message => :bad})
        end
        id = link.exception ? 'bad_url' : 'bad_fragment'
        table2(link_div, data, id)
        page_div.add_element(Element.new('p'))
      end
    end
  end

  # Adds one table per off-site link, grouped by source page.
  def add_offsite_links(body)
    h2 = body.add_element(Element.new('h2'))
    h2.text = 'Off-Site Links by Source Page'
    none = true
    pages.each_pair do |path, page|
      offsite_links = page.links.select do |link|
        RDocLinkChecker.offsite?(link.href)
      end
      next if offsite_links.empty?

      none = false
      h3 = body.add_element(Element.new('h3'))
      a = Element.new('a')
      a.text = path
      a.add_attribute('href', path)
      h3.add_element(a)
      offsite_links.each do |link|
        data = []
        # Text, URL, fragment
        a = Element.new('a')
        a.text = link.href
        a.add_attribute('href', link.href)
        class_ = link.valid_p ? :good : :bad
        data.push({'Href' => :label, a => class_})
        data.push({'Text' => :label, link.text => :good})
        table2(body, data)
        body.add_element(Element.new('p'))
      end
    end
    if none
      p = body.add_element(Element.new('p'))
      p.text = 'None.'
    end
  end

  # CSS class strings keyed by the row-value symbols used in table data.
  Classes = {
    label: 'label center neutral',
    good: 'data center good',
    iffy: 'data center iffy',
    bad: 'data center bad',
  }

  # Builds a two-column table under +parent+ from +data+, an array of
  # two-entry hashes {label => label_class, value => value_class}.
  # +id+ (optional) becomes the table's id attribute; +title+ (optional,
  # String or REXML::Element) becomes a spanning header row.
  #
  # Fixes two defects in the original: +id+ had no default although
  # add_offsite_links calls table2(body, data) with two arguments
  # (ArgumentError), and the title row created a bogus <tr)> element
  # via Element.new('tr)').
  def table2(parent, data, id = nil, title = nil)
    data = data.dup
    table = parent.add_element(Element.new('table'))
    table.add_attribute('id', id) if id
    if title
      tr = table.add_element(Element.new('tr'))
      th = tr.add_element(Element.new('th'))
      th.add_attribute('colspan', 2)
      if title.kind_of?(REXML::Element)
        th.add_element(title)
      else
        th.text = title
      end
    end
    data.each do |row_h|
      label, label_class, value, value_class = row_h.flatten
      tr = table.add_element(Element.new('tr'))
      td = tr.add_element(Element.new('td'))
      td.text = label
      td.add_attribute('class', Classes[label_class])
      td = tr.add_element(Element.new('td'))
      if value.kind_of?(REXML::Element)
        td.add_element(value)
      else
        td.text = value
      end
      td.add_attribute('class', Classes[value_class])
    end
  end

  # Base error class for this checker.
  # NOTE(review): does not subclass StandardError; instances are stored
  # on links rather than raised, so this appears intentional — confirm.
  class Error; end

  # Wraps an exception raised while fetching +url+; stored on the link,
  # not raised.
  class HttpResponseError < Error

    attr_accessor :url, :x

    def initialize(url, x)
      self.url = url
      self.x = x
    end

    # NOTE(review): the original heredoc body was truncated in the
    # mangled copy — TODO restore the upstream message text.
    def message
      <<~EOT
        Exception:
          Class: #{x.class}
          Message: #{x.message}
          URL: #{url}
      EOT
    end

  end

end