lib/link_checker.rb in link-checker-0.6.0 vs lib/link_checker.rb in link-checker-0.7.1

- lines removed (present only in 0.6.0)
+ lines added (present only in 0.7.1)

@@ -6,36 +6,52 @@
 require 'colorize'
 require 'anemone'
 
 class LinkChecker
 
+  # Create a new instance of LinkChecker
+  #
+  # @param params [Hash] A hash containing the :target value, which can represent either
+  #   a file path or a URL. And an optional :options value, which contains a hash with a
+  #   list of possible optional paramters. This can include :no_warnings, :warnings_are_errors,
+  #   or :max_threads
   def initialize(params)
     @options = params[:options] || { }
     @target = params[:target] || './'
 
     @html_files = []
     @links = []
     @errors = []
     @warnings = []
     @return_code = 0
 
-    @options[:max_threads] ||= 100 # Only happens in testing.
+    @options[:max_threads] ||= 100
   end
 
+  # Find a list of HTML files in the @target path, which was set in the {#initialize} method.
   def html_file_paths
     Find.find(@target).map {|path|
       FileTest.file?(path) && (path =~ /\.html?$/) ? path : nil
-    }.reject{|path| path.nil?}
+    }.reject{|path| path.nil? }
   end
 
+  # Find a list of all external links in the specified target, represented as URI strings.
+  #
+  # @param source [String] Either a file path or a URL.
+  # @return [Array] A list of URI strings.
   def self.external_link_uri_strings(source)
     Nokogiri::HTML(source).css('a').select {|link|
       !link.attribute('href').nil? &&
       link.attribute('href').value =~ /^https?\:\/\//
-    }.map{|link| link.attributes['href'].value}
+    }.map{|link| link.attributes['href'].value }
   end
 
+  # Check one URL.
+  #
+  # @param uri [URI] A URI object for the target URL.
+  # @return [LinkChecker::Result] One of the following objects: {LinkChecker::Good},
+  #   {LinkChecker::Redirect}, or {LinkChecker::Error}.
   def self.check_uri(uri, redirected=false)
     http = Net::HTTP.new(uri.host, uri.port)
     http.use_ssl = true if uri.scheme == "https"
     http.start do
       path = (uri.path.empty?) ? '/' : uri.path
@@ -54,10 +70,13 @@
         end
       end
     end
   end
 
+  # Check the URLs in the @target, either using {#check_uris_by_crawling} or
+  # {#check_uris_in_files}, depending on whether the @target looks like an http:// URL or
+  # a file path.
   def check_uris
     begin
       if @target =~ /^https?\:\/\//
         check_uris_by_crawling
       else
@@ -82,58 +101,72 @@
     end
 
     @return_code
   end
 
+  # Use {http://anemone.rubyforge.org Anemone} to crawl the pages at the @target URL,
+  # and then check all of the external URLs in those pages.
   def check_uris_by_crawling
     threads = []
     Anemone.crawl(@target) do |anemone|
       anemone.storage = Anemone::Storage.PStore('link-checker-crawled-pages.pstore')
       anemone.on_every_page do |crawled_page|
         raise StandardError.new(crawled_page.error) if crawled_page.error
-        threads << start_link_check_thread(crawled_page.body, crawled_page.url.to_s)
+        threads << check_page(crawled_page.body, crawled_page.url.to_s)
         @html_files << crawled_page
       end
     end
     threads.each{|thread| thread.join }
   end
 
+  # Treat the @target as a file path and find all HTML files under that path, and then
+  # scan all of the external URLs in those files.
   def check_uris_in_files
     threads = []
     html_file_paths.each do |file|
       wait_to_spawn_thread
-      threads << start_link_check_thread(open(file), file)
+      threads << check_page(open(file), file)
       @html_files << file
     end
     threads.each{|thread| thread.join }
   end
 
-  def start_link_check_thread(source, source_name)
+  # Spawn a thread to check an HTML page, and then spawn a thread for checking each
+  # link within that page.
+  #
+  # @param source [String] The contents of the HTML page, as a string.
+  # @param source_name [String] The name of the source, which will be reported if
+  #   there is an error or a warning.
+  def check_page(page, page_name)
     Thread.new do
       threads = []
       results = []
-      self.class.external_link_uri_strings(source).each do |uri_string|
-        Thread.exclusive { @links << source }
+      self.class.external_link_uri_strings(page).each do |uri_string|
+        Thread.exclusive { @links << page }
        wait_to_spawn_thread
        threads << Thread.new do
          begin
            uri = URI(uri_string)
            response = self.class.check_uri(uri)
            response.uri_string = uri_string
            Thread.exclusive { results << response }
          rescue => error
            Thread.exclusive { results <<
-              Error.new(:error => error.to_s, :uri_string => uri_string) }
+              Error.new( :error => error.to_s, :uri_string => uri_string) }
          end
        end
      end
      threads.each {|thread| thread.join }
-      report_results(source_name, results)
+      report_results(page_name, results)
    end
  end
-
-  def report_results(file, results)
+
+  # Report the results of scanning one HTML page.
+  #
+  # @param page_name [String] The name of the page.
+  # @param results [Array] An array of {LinkChecker::Result} objects.
+  def report_results(page_name, results)
    errors = results.select{|result| result.class.eql? Error}
    warnings = results.select{|result| result.class.eql? Redirect}
    @return_code = 1 unless errors.empty?
    if @options[:warnings_are_errors]
      @return_code = 1 unless warnings.empty?
@@ -145,11 +178,11 @@
      # This must be thread-exclusive to avoid a race condition.
      @errors = @errors.concat(errors)
      @warnings = @warnings.concat(warnings)
 
    if errors.empty?
-      message = "Checked: #{file}"
+      message = "Checked: #{page_name}"
      if warnings.empty? || @options[:no_warnings]
        puts message.green
      else
        puts message.yellow
      end
@@ -158,11 +191,11 @@
          puts "  Warning: #{warning.uri_string}".yellow
          puts "    Redirected to: #{warning.final_destination_uri_string}".yellow
        end
      end
    else
-      puts "Problem: #{file}".red
+      puts "Problem: #{page_name}".red
      errors.each do |error|
        puts "  Link: #{error.uri_string}".red
        case error
        when Redirect
          puts "    Redirected to: #{error.final_destination_uri_string}".red
@@ -172,39 +205,58 @@
        end
      end
    end
  end
 
+  # Abstract base class for representing the results of checking one URI.
  class Result
    attr_accessor :uri_string
+
+    # A new LinkChecker::Result object instance.
+    #
+    # @param params [Hash] A hash of parameters. Expects :uri_string.
    def initialize(params)
      @uri_string = params[:uri_string]
    end
  end
 
+  # A good result. The URL is valid.
  class Good < Result
  end
 
+  # A redirection to another URL.
  class Redirect < Result
    attr_reader :good
    attr_reader :final_destination_uri_string
+
+    # A new LinkChecker::Redirect object.
+    #
+    # @param params [Hash] A hash of parameters. Expects :final_destination_uri_string,
+    #   which is the URL that the original :uri_string redirected to.
    def initialize(params)
      @final_destination_uri_string = params[:final_destination_uri_string]
      @good = params[:good]
      super(params)
    end
  end
 
+  # A bad result. The URL is not valid for some reason. Any reason, other than a 200
+  # HTTP response.
+  #
+  # @param params [Hash] A hash of parameters. Expects :error, which is a string
+  #   representing the error.
  class Error < Result
    attr_reader :error
    def initialize(params)
      @error = params[:error]
      super(params)
    end
  end
 
  private
 
+  # Checks the current :max_threads setting and blocks until the number of threads is
+  # below that number.
  def wait_to_spawn_thread
    # Never spawn more than the specified maximum number of threads.
    until Thread.list.select {|thread| thread.status == "run"}.count < (1 + @options[:max_threads]) do
      # Wait 5 milliseconds before trying again.
\ No newline at end of file
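
For reference, a minimal usage sketch of the public API documented in the 0.7.1 comments above. The ./site path and the specific option values are illustrative only; the option names (:no_warnings, :warnings_are_errors, :max_threads), the :target behavior, and the integer value returned by #check_uris are taken from the code in this diff.

require 'link_checker'

# Check every external link in the HTML files under ./site.
# Passing an http:// or https:// URL as :target would crawl that site instead.
checker = LinkChecker.new(
  :target  => './site',
  :options => {
    :no_warnings         => false,  # still print redirect warnings
    :warnings_are_errors => true,   # a redirect also forces a non-zero return code
    :max_threads         => 25      # cap the number of concurrent link-check threads
  }
)

# check_uris returns 0 when all links are good and 1 otherwise,
# so it can be used directly as a process exit status.
exit checker.check_uris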