lib/site_checker.rb in site_checker-0.1.1 vs lib/site_checker.rb in site_checker-0.2.0.pre
- old
+ new
@@ -1,224 +1,108 @@
-require 'nokogiri'
require 'open-uri'
+require 'nokogiri'
-class SiteChecker
- attr_accessor :problems
- attr_accessor :ignore_list, :visit_references, :max_recursion_depth
+require 'site_checker/io/content_from_file_system'
+require 'site_checker/io/content_from_web'
+require 'site_checker/parse/page'
+require 'site_checker/link'
+require 'site_checker/link_collector'
+require 'site_checker/dsl'
- def initialize()
- yield self if block_given?
- @ignore_list ||= []
- @visit_references ||= false
- @max_recursion_depth ||= -1
- end
+module SiteChecker
+ class << self
+ attr_accessor :ignore_list
+ attr_accessor :visit_references
+ attr_accessor :max_recursion_depth
+ attr_accessor :dsl_enabled
+ attr_reader :link_collector
- def check(url, root)
- @visits = {}
- @problems = {}
- @recursion_depth = 0
-
- @root = root
-
- register_visit(:local_page, url)
- process_local_page(url, nil)
- end
-
- def local_pages
- @visits[:local_page]
- end
-
- def remote_pages
- @visits[:remote_page]
- end
-
- def local_images
- @visits[:local_image]
- end
-
- def remote_images
- @visits[:remote_image]
- end
-
- private
- def process_local_page(url, parent_url)
- links = collect_links(url, parent_url)
-
- filter_out_working_anchors!(links)
- report_and_remove_anchors!(links, parent_url)
-
- links.each do |link, kind|
- if kind != :anchor
- visit(kind, url, link) unless visited?(kind, link)
- else
- end
+ ##
+ # Yields the SiteChecker module itself so that its settings can be assigned inside a block.
+ # A block is required. The following options are available and can be used together:
+ # - ignore certain links:
+ #
+ #     SiteChecker.configure do |config|
+ #       config.ignore_list = ["/", "/atom.xml"]
+ #     end
+ #
+ # - visit the external references as well:
+ #
+ #     SiteChecker.configure do |config|
+ #       config.visit_references = true
+ #     end
+ #
+ # - set the depth of the recursion:
+ #
+ #     SiteChecker.configure do |config|
+ #       config.max_recursion_depth = 3
+ #     end
+ def configure
+ yield self
+ end
- end
- def register_visit(kind, link)
- @visits[kind] = [] unless @visits.has_key?(kind)
- @visits[kind] << link
- end
-
- def visited?(kind, link)
- @visits[kind] = [] unless @visits.has_key?(kind)
- @visits[kind].include?(link)
- end
-
- def visit(kind, parent_url, link)
- register_visit(kind, link)
- if kind != :local_page
- open_reference(kind, link, parent_url)
- else
- unless stop_recursion?
- @recursion_depth += 1
- process_local_page(link, parent_url)
- @recursion_depth -= 1
- end
- end
- end
-
- def open_reference(kind, link, parent_url)
- content = nil
- begin
- if kind == :local_page
- if URI(@root).absolute?
- content = open(link)
- else
- link = add_index_html(link)
- content = File.open(link).read
- end
- elsif kind == :local_image
- if URI(@root).absolute?
- open(link)
- else
- File.open(link)
- end
- elsif @visit_references
- open(link)
- end
- rescue OpenURI::HTTPError => e
- new_problem(strip_root(parent_url), "#{strip_root(link)} (#{e.message.strip})")
- rescue Errno::ENOENT => e
- link = remove_index_html(link) if kind == :local_page
- new_problem(strip_root(parent_url), "#{strip_root(link)} (404 Not Found)")
- rescue => e
- new_problem(strip_root(parent_url), "#{strip_root(link)} (#{e.message.strip})")
+ ##
+ # Recursively visits the provided url looking for reference problems.
+ #
+ # @param [String] url where the processing starts
+ # @param [String] root the root URL of the site
+ #
+ def check(url, root)
+ create_instance
+ @link_collector.check(url, root)
end
- content
- end
- def filter_out_working_anchors!(links)
- links.delete_if{ |link, kind| (kind == :local_page && has_anchor?(links, link)) }
- end
-
- def report_and_remove_anchors!(links, parent_url)
- anchors = links.select {|link, kind| link.match(/^.+#.+$/) && kind == :local_page}
- anchors.each do |anchor, kind|
- new_problem(strip_root(parent_url), "#{strip_root(anchor)} (404 Not Found)")
- links.delete(anchor)
+ ##
+ # Returns the Array of the visited local pages.
+ #
+ # @return [Array] list of the visited local pages
+ #
+ def local_pages
+ @link_collector.local_pages
end
- end
- def has_anchor?(links, link)
- anchor = link.gsub(/^.+#/, "")
- links.has_key?(anchor) && links[anchor] == :anchor
- end
-
-
- def absolute_reference?(link)
- link.start_with?(@root)
- end
-
- def relative_reference?(link)
- link =~ /^\/.+/
- end
-
- def collect_links(url, parent_url)
- links = {}
- content = open_reference(:local_page, url, parent_url)
- if content
- doc = Nokogiri(content)
- doc.xpath("//img").reject {|img| ignored?(img['src'])}.each do |img|
- link_kind = detect_link_and_kind(img['src'], url, :remote_image, :local_image)
- links.merge!(link_kind) unless link_kind.empty?
- end
- doc.xpath("//a").reject {|a| ignored?(a['href'])}.each do |a|
- link_kind = detect_link_and_kind(a['href'], url, :remote_page, :local_page)
- links.merge!(link_kind) unless link_kind.empty?
- end
-
- doc.xpath("//a").reject {|a| !a['id']}.each do |a|
- links.merge!({a['id'] => :anchor})
- end
+ ##
+ # Returns the Array of the visited remote (external) pages.
+ #
+ # @return [Array] list of the visited remote pages
+ #
+ def remote_pages
+ @link_collector.remote_pages
end
- links
- end
- def detect_link_and_kind(reference, url, external_kind, local_kind)
- link_kind = {}
- link = URI(strip_trailing_slash(reference))
- if link.to_s.start_with?(@root)
- new_problem(url, "#{link} (absolute path)")
- else
- if URI(reference).absolute?
- link_kind[link.to_s] = external_kind
- else
- link_kind[create_absolute_reference(link.to_s)] = local_kind
- end
+ ##
+ # Returns the Array of the visited local images.
+ #
+ # @return [Array] list of the visited local images
+ #
+ def local_images
+ @link_collector.local_images
end
- link_kind
- end
- def strip_trailing_slash(link)
- link.gsub(/\/$/, "")
- end
-
- def strip_root(link)
- if link
- link.gsub(/^#{@root}[\/]?/, "")
- else
- ""
+ ##
+ # Returns the Array of the visited remote (external) images.
+ #
+ # @return [Array] list of the visited remote images
+ #
+ def remote_images
+ @link_collector.remote_images
end
- end
- def add_index_html(path)
- path.end_with?(".html") ? path : File.join(path, "index.html")
- end
-
- def remove_index_html(path)
- path.gsub(/\/index.html$/, "")
- end
-
- def create_absolute_reference(link)
- root = URI(@root)
- if root.absolute?
- root.merge(link).to_s.gsub(/\/$/, "")
- else
- File.join(root.path, link)
+ ##
+ # Returns the Hash (:parent_url => [Array of problematic links]) of the problems.
+ #
+ # @return [Hash] the result of the check
+ #
+ def problems
+ @link_collector.problems
end
- end
- def new_problem(url, message)
- url = @root if url.empty?
- @problems[url] = [] unless problems.has_key?(url)
- @problems[url] << message
- end
-
- def ignored?(link)
- if link
- @ignore_list.include? link
- else
- true
+ private
+ def create_instance # replaces @link_collector with a new collector configured from the module-level settings
+ @link_collector = SiteChecker::LinkCollector.new do |config|
+ config.visit_references = @visit_references if @visit_references # only truthy settings are copied; nil/false are skipped
+ config.ignore_list = @ignore_list if @ignore_list
+ config.max_recursion_depth = @max_recursion_depth if @max_recursion_depth
+ end
+ end
end
-
- def stop_recursion?
- if @max_recursion_depth == -1
- false
- elsif @max_recursion_depth > @recursion_depth
- false
- else
- true
- end
- end
-end
+end
\ No newline at end of file