lib/site_checker.rb in site_checker-0.1.1 vs lib/site_checker.rb in site_checker-0.2.0.pre

- old
+ new

@@ -1,224 +1,108 @@
-require 'nokogiri'
 require 'open-uri'
+require 'nokogiri'

-class SiteChecker
-  attr_accessor :problems
-  attr_accessor :ignore_list, :visit_references, :max_recursion_depth
+require 'site_checker/io/content_from_file_system'
+require 'site_checker/io/content_from_web'
+require 'site_checker/parse/page'
+require 'site_checker/link'
+require 'site_checker/link_collector'
+require 'site_checker/dsl'

-  def initialize()
-    yield self if block_given?
-    @ignore_list ||= []
-    @visit_references ||= false
-    @max_recursion_depth ||= -1
-  end
+module SiteChecker
+  class << self
+    attr_accessor :ignore_list
+    attr_accessor :visit_references
+    attr_accessor :max_recursion_depth
+    attr_accessor :dsl_enabled
+    attr_reader :link_collector

-  def check(url, root)
-    @visits = {}
-    @problems = {}
-    @recursion_depth = 0
-
-    @root = root
-
-    register_visit(:local_page, url)
-    process_local_page(url, nil)
-  end
-
-  def local_pages
-    @visits[:local_page]
-  end
-
-  def remote_pages
-    @visits[:remote_page]
-  end
-
-  def local_images
-    @visits[:local_image]
-  end
-
-  def remote_images
-    @visits[:remote_image]
-  end
-
-  private
-  def process_local_page(url, parent_url)
-    links = collect_links(url, parent_url)
-
-    filter_out_working_anchors!(links)
-    report_and_remove_anchors!(links, parent_url)
-
-    links.each do |link, kind|
-      if kind != :anchor
-        visit(kind, url, link) unless visited?(kind, link)
-      else
-      end
+    ##
+    # The following configuration options, which can be used together, are available:
+    #
+    # - ignoring certain links:
+    #
+    #     SiteChecker.configure do |config|
+    #       config.ignore_list = ["/", "/atom.xml"]
+    #     end
+    #
+    # - visit the external references as well:
+    #
+    #     SiteChecker.configure do |config|
+    #       config.visit_references = true
+    #     end
+    #
+    # - set the depth of the recursion:
+    #
+    #     SiteChecker.configure do |config|
+    #       config.max_recursion_depth = 3
+    #     end
+    def configure
+      yield self
     end
-  end

-  def register_visit(kind, link)
-    @visits[kind] = [] unless @visits.has_key?(kind)
-    @visits[kind] << link
-  end
-
-  def visited?(kind, link)
-    @visits[kind] = [] unless @visits.has_key?(kind)
-    @visits[kind].include?(link)
-  end
-
-  def visit(kind, parent_url, link)
-    register_visit(kind, link)
-    if kind != :local_page
-      open_reference(kind, link, parent_url)
-    else
-      unless stop_recursion?
-        @recursion_depth += 1
-        process_local_page(link, parent_url)
-        @recursion_depth -= 1
-      end
-    end
-  end
-
-  def open_reference(kind, link, parent_url)
-    content = nil
-    begin
-      if kind == :local_page
-        if URI(@root).absolute?
-          content = open(link)
-        else
-          link = add_index_html(link)
-          content = File.open(link).read
-        end
-      elsif kind == :local_image
-        if URI(@root).absolute?
-          open(link)
-        else
-          File.open(link)
-        end
-      elsif @visit_references
-        open(link)
-      end
-    rescue OpenURI::HTTPError => e
-      new_problem(strip_root(parent_url), "#{strip_root(link)} (#{e.message.strip})")
-    rescue Errno::ENOENT => e
-      link = remove_index_html(link) if kind == :local_page
-      new_problem(strip_root(parent_url), "#{strip_root(link)} (404 Not Found)")
-    rescue => e
-      new_problem(strip_root(parent_url), "#{strip_root(link)} (#{e.message.strip})")
+    ##
+    # Recursively visits the provided url looking for reference problems.
+    #
+    # @param [String] url where the processing starts
+    # @param [String] root the root URL of the site
+    #
+    def check(url, root)
+      create_instance
+      @link_collector.check(url, root)
     end
-    content
-  end

-  def filter_out_working_anchors!(links)
-    links.delete_if{ |link, kind| (kind == :local_page && has_anchor?(links, link)) }
-  end
-
-  def report_and_remove_anchors!(links, parent_url)
-    anchors = links.select {|link, kind| link.match(/^.+#.+$/) && kind == :local_page}
-    anchors.each do |anchor, kind|
-      new_problem(strip_root(parent_url), "#{strip_root(anchor)} (404 Not Found)")
-      links.delete(anchor)
+    ##
+    # Returns the Array of the visited local pages.
+    #
+    # @return [Array] list of the visited local pages
+    #
+    def local_pages
+      @link_collector.local_pages
     end
-  end

-  def has_anchor?(links, link)
-    anchor = link.gsub(/^.+#/, "")
-    links.has_key?(anchor) && links[anchor] == :anchor
-  end
-
-
-  def absolute_reference?(link)
-    link.start_with?(@root)
-  end
-
-  def relative_reference?(link)
-    link =~ /^\/.+/
-  end
-
-  def collect_links(url, parent_url)
-    links = {}
-    content = open_reference(:local_page, url, parent_url)
-    if content
-      doc = Nokogiri(content)
-      doc.xpath("//img").reject {|img| ignored?(img['src'])}.each do |img|
-        link_kind = detect_link_and_kind(img['src'], url, :remote_image, :local_image)
-        links.merge!(link_kind) unless link_kind.empty?
-      end
-      doc.xpath("//a").reject {|a| ignored?(a['href'])}.each do |a|
-        link_kind = detect_link_and_kind(a['href'], url, :remote_page, :local_page)
-        links.merge!(link_kind) unless link_kind.empty?
-      end
-
-      doc.xpath("//a").reject {|a| !a['id']}.each do |a|
-        links.merge!({a['id'] => :anchor})
-      end
+    ##
+    # Returns the Array of the visited remote (external) pages.
+    #
+    # @return [Array] list of the visited remote pages
+    #
+    def remote_pages
+      @link_collector.remote_pages
     end
-    links
-  end

-  def detect_link_and_kind(reference, url, external_kind, local_kind)
-    link_kind = {}
-    link = URI(strip_trailing_slash(reference))
-    if link.to_s.start_with?(@root)
-      new_problem(url, "#{link} (absolute path)")
-    else
-      if URI(reference).absolute?
-        link_kind[link.to_s] = external_kind
-      else
-        link_kind[create_absolute_reference(link.to_s)] = local_kind
-      end
+    ##
+    # Returns the Array of the visited local images.
+    #
+    # @return [Array] list of the visited local images
+    #
+    def local_images
+      @link_collector.local_images
     end
-    link_kind
-  end

-  def strip_trailing_slash(link)
-    link.gsub(/\/$/, "")
-  end
-
-  def strip_root(link)
-    if link
-      link.gsub(/^#{@root}[\/]?/, "")
-    else
-      ""
+    ##
+    # Returns the Array of the visited remote (external) images.
+    #
+    # @return [Array] list of the visited remote images
+    #
+    def remote_images
+      @link_collector.remote_images
     end
-  end

-  def add_index_html(path)
-    path.end_with?(".html") ? path : File.join(path, "index.html")
-  end
-
-  def remove_index_html(path)
-    path.gsub(/\/index.html$/, "")
-  end
-
-  def create_absolute_reference(link)
-    root = URI(@root)
-    if root.absolute?
-      root.merge(link).to_s.gsub(/\/$/, "")
-    else
-      File.join(root.path, link)
+    ##
+    # Returns the Hash (:parent_url => [Array of problematic links]) of the problems.
+    #
+    # @return [Hash] the result of the check
+    #
+    def problems
+      @link_collector.problems
     end
-  end

-  def new_problem(url, message)
-    url = @root if url.empty?
-    @problems[url] = [] unless problems.has_key?(url)
-    @problems[url] << message
-  end
-
-  def ignored?(link)
-    if link
-      @ignore_list.include? link
-    else
-      true
+    private
+    def create_instance
+      @link_collector = SiteChecker::LinkCollector.new do |config|
+        config.visit_references = @visit_references if @visit_references
+        config.ignore_list = @ignore_list if @ignore_list
+        config.max_recursion_depth = @max_recursion_depth if @max_recursion_depth
+      end
     end
   end
-
-  def stop_recursion?
-    if @max_recursion_depth == -1
-      false
-    elsif @max_recursion_depth > @recursion_depth
-      false
-    else
-      true
-    end
-  end
-end
+end
\ No newline at end of file
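For orientation, the sketch below exercises the module-level API exactly as it appears on the 0.2.0.pre side of this diff (configure, check, problems, and the per-category readers). It is a minimal usage sketch, not part of the gem's documentation; the site URL and ignore entries are made-up example values.

    require 'site_checker'

    # Configure the module-level options documented in the new rdoc comments;
    # the values here are hypothetical examples, not defaults.
    SiteChecker.configure do |config|
      config.ignore_list = ["/", "/atom.xml"]  # links to skip
      config.visit_references = true           # also open external references
      config.max_recursion_depth = 3           # limit the crawl depth
    end

    # Recursively visit the site; both arguments are example values.
    SiteChecker.check("http://localhost:4000", "http://localhost:4000")

    # problems is a Hash of :parent_url => [Array of problematic links].
    SiteChecker.problems.each do |parent, links|
      puts "#{parent}: #{links.join(', ')}"
    end

    # The visited links are also available by category.
    p SiteChecker.local_pages
    p SiteChecker.remote_images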
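For callers migrating from 0.1.1, the same settings move from the constructor block to SiteChecker.configure. A hedged before/after sketch, again with hypothetical values:

    # 0.1.1: configuration happens per instance, in the constructor block.
    checker = SiteChecker.new do |c|
      c.ignore_list = ["/atom.xml"]
    end
    checker.check("http://localhost:4000", "http://localhost:4000")
    puts checker.problems

    # 0.2.0.pre: configuration is module-wide; check builds a LinkCollector
    # behind the scenes (see create_instance above) and delegates to it.
    SiteChecker.configure { |c| c.ignore_list = ["/atom.xml"] }
    SiteChecker.check("http://localhost:4000", "http://localhost:4000")
    puts SiteChecker.problems

One consequence of the guards in create_instance is worth noting: an option is forwarded to the LinkCollector only when it is truthy, so explicitly setting visit_references = false is indistinguishable from never setting it.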