lib/link_checker.rb in link-checker-0.6.0 vs lib/link_checker.rb in link-checker-0.7.1
- old
+ new
@@ -6,36 +6,52 @@
require 'colorize'
require 'anemone'
class LinkChecker
+ # Create a new instance of LinkChecker.
+ #
+ # @param params [Hash] A hash containing the :target value, which can be either
+ # a file path or a URL, and an optional :options value, which is a hash of
+ # optional parameters. These can include :no_warnings, :warnings_are_errors,
+ # or :max_threads.
def initialize(params)
@options = params[:options] || { }
@target = params[:target] || './'
@html_files = []
@links = []
@errors = []
@warnings = []
@return_code = 0
- @options[:max_threads] ||= 100 # Only happens in testing.
+ @options[:max_threads] ||= 100
end
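# Editorial usage sketch (not part of the gem source or this diff), showing the
# two keys that #initialize accepts, per the doc comment above:
#
#   checker = LinkChecker.new(
#     :target  => './site',
#     :options => { :no_warnings => true, :max_threads => 10 })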
+ # Find a list of HTML files in the @target path, which was set in the {#initialize} method.
def html_file_paths
Find.find(@target).map {|path|
FileTest.file?(path) && (path =~ /\.html?$/) ? path : nil
- }.reject{|path| path.nil?}
+ }.reject{|path| path.nil? }
end
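# Editorial usage sketch (not part of the gem source or this diff); the paths
# shown are hypothetical:
#
#   checker = LinkChecker.new(:target => './site')
#   checker.html_file_paths
#   #=> ["./site/index.html", "./site/news/2013.htm"]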
+ # Find a list of all of the external links in the given HTML source, represented as URI strings.
+ #
+ # @param source [String] The HTML source to scan, as a string or an IO object.
+ # @return [Array] A list of URI strings.
def self.external_link_uri_strings(source)
Nokogiri::HTML(source).css('a').select {|link|
!link.attribute('href').nil? &&
link.attribute('href').value =~ /^https?\:\/\//
- }.map{|link| link.attributes['href'].value}
+ }.map{|link| link.attributes['href'].value }
end
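# Editorial usage sketch (not part of the gem source or this diff): only anchors
# whose href starts with http:// or https:// are returned.
#
#   html = '<a href="https://example.com/">a</a> <a href="/relative">b</a>'
#   LinkChecker.external_link_uri_strings(html)
#   #=> ["https://example.com/"]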
+ # Check one URL.
+ #
+ # @param uri [URI] A URI object for the target URL.
+ # @param redirected [Boolean] Used internally when the method follows a redirect
+ # and re-checks the destination URL.
+ # @return [LinkChecker::Result] One of the following objects: {LinkChecker::Good},
+ # {LinkChecker::Redirect}, or {LinkChecker::Error}.
def self.check_uri(uri, redirected=false)
http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = true if uri.scheme == "https"
http.start do
path = (uri.path.empty?) ? '/' : uri.path
@@ -54,10 +70,13 @@
end
end
end
end
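# Editorial usage sketch (not part of the gem source or this diff); this makes a
# live HTTP request, so the result depends on the URL checked:
#
#   result = LinkChecker.check_uri(URI('http://example.com/'))
#   result.is_a?(LinkChecker::Good)  #=> true, if the server answers with a 200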
+ # Check the URLs in the @target, using either {#check_uris_by_crawling} or
+ # {#check_uris_in_files}, depending on whether the @target looks like an http:// URL or
+ # a file path.
def check_uris
begin
if @target =~ /^https?\:\/\//
check_uris_by_crawling
else
@@ -82,58 +101,72 @@
end
@return_code
end
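# Editorial usage sketch (not part of the gem source or this diff): #check_uris
# returns a shell-style status code, 0 when every link checked out and 1 when
# there were errors (or warnings, when :warnings_are_errors is set):
#
#   checker = LinkChecker.new(:target => 'http://example.com/')
#   exit checker.check_uris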
+ # Use {http://anemone.rubyforge.org Anemone} to crawl the pages at the @target URL,
+ # and then check all of the external URLs in those pages.
def check_uris_by_crawling
threads = []
Anemone.crawl(@target) do |anemone|
anemone.storage = Anemone::Storage.PStore('link-checker-crawled-pages.pstore')
anemone.on_every_page do |crawled_page|
raise StandardError.new(crawled_page.error) if crawled_page.error
- threads << start_link_check_thread(crawled_page.body, crawled_page.url.to_s)
+ threads << check_page(crawled_page.body, crawled_page.url.to_s)
@html_files << crawled_page
end
end
threads.each{|thread| thread.join }
end
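# Editorial note (not part of the gem source or this diff): the block above uses
# Anemone's standard crawl API; a minimal standalone sketch of that pattern:
#
#   require 'anemone'
#   Anemone.crawl('http://example.com/') do |anemone|
#     anemone.on_every_page {|page| puts page.url }
#   end
#
# As written above, crawled pages are cached in link-checker-crawled-pages.pstore
# in the working directory.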
+ # Treat the @target as a file path and find all HTML files under that path, and then
+ # scan all of the external URLs in those files.
def check_uris_in_files
threads = []
html_file_paths.each do |file|
wait_to_spawn_thread
- threads << start_link_check_thread(open(file), file)
+ threads << check_page(open(file), file)
@html_files << file
end
threads.each{|thread| thread.join }
end
- def start_link_check_thread(source, source_name)
+ # Spawn a thread to check an HTML page, and then spawn a thread to check each
+ # link within that page.
+ #
+ # @param page [String] The contents of the HTML page, as a string.
+ # @param page_name [String] The name of the page, which will be reported if
+ # there is an error or a warning.
+ def check_page(page, page_name)
Thread.new do
threads = []
results = []
- self.class.external_link_uri_strings(source).each do |uri_string|
- Thread.exclusive { @links << source }
+ self.class.external_link_uri_strings(page).each do |uri_string|
+ Thread.exclusive { @links << page }
wait_to_spawn_thread
threads << Thread.new do
begin
uri = URI(uri_string)
response = self.class.check_uri(uri)
response.uri_string = uri_string
Thread.exclusive { results << response }
rescue => error
Thread.exclusive { results <<
- Error.new(:error => error.to_s, :uri_string => uri_string) }
+ Error.new( :error => error.to_s, :uri_string => uri_string) }
end
end
end
threads.each {|thread| thread.join }
- report_results(source_name, results)
+ report_results(page_name, results)
end
end
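# Editorial note (not part of the gem source or this diff): the shared arrays
# above are guarded with Thread.exclusive, which is deprecated on newer Rubies;
# a Mutex expresses the same idea. A minimal sketch, with uri_strings and check
# standing in for the page's links and the per-link work (both hypothetical):
#
#   mutex   = Mutex.new
#   results = []
#   threads = uri_strings.map do |uri_string|
#     Thread.new do
#       response = check(uri_string)              # hypothetical check call
#       mutex.synchronize { results << response } # serialize shared writes
#     end
#   end
#   threads.each {|thread| thread.join }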
-
- def report_results(file, results)
+
+ # Report the results of scanning one HTML page.
+ #
+ # @param page_name [String] The name of the page.
+ # @param results [Array] An array of {LinkChecker::Result} objects.
+ def report_results(page_name, results)
errors = results.select{|result| result.class.eql? Error}
warnings = results.select{|result| result.class.eql? Redirect}
@return_code = 1 unless errors.empty?
if @options[:warnings_are_errors]
@return_code = 1 unless warnings.empty?
@@ -145,11 +178,11 @@
# This must be thread-exclusive to avoid a race condition.
@errors = @errors.concat(errors)
@warnings = @warnings.concat(warnings)
if errors.empty?
- message = "Checked: #{file}"
+ message = "Checked: #{page_name}"
if warnings.empty? || @options[:no_warnings]
puts message.green
else
puts message.yellow
end
@@ -158,11 +191,11 @@
puts " Warning: #{warning.uri_string}".yellow
puts " Redirected to: #{warning.final_destination_uri_string}".yellow
end
end
else
- puts "Problem: #{file}".red
+ puts "Problem: #{page_name}".red
errors.each do |error|
puts " Link: #{error.uri_string}".red
case error
when Redirect
puts " Redirected to: #{error.final_destination_uri_string}".red
@@ -172,39 +205,58 @@
end
end
end
end
+ # Abstract base class for representing the results of checking one URI.
class Result
attr_accessor :uri_string
+
+ # A new LinkChecker::Result object.
+ #
+ # @param params [Hash] A hash of parameters. Expects :uri_string.
def initialize(params)
@uri_string = params[:uri_string]
end
end
+ # A good result. The URL is valid.
class Good < Result
end
+ # A redirection to another URL.
class Redirect < Result
attr_reader :good
attr_reader :final_destination_uri_string
+
+ # A new LinkChecker::Redirect object.
+ #
+ # @param params [Hash] A hash of parameters. Expects :final_destination_uri_string,
+ # which is the URL that the original :uri_string redirected to.
def initialize(params)
@final_destination_uri_string = params[:final_destination_uri_string]
@good = params[:good]
super(params)
end
end
+ # A bad result. The URL is not valid for some reason; any reason other than a 200
+ # HTTP response.
class Error < Result
attr_reader :error
+
+ # A new LinkChecker::Error object.
+ #
+ # @param params [Hash] A hash of parameters. Expects :error, which is a string
+ # representing the error.
def initialize(params)
@error = params[:error]
super(params)
end
end
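# Editorial usage sketch (not part of the gem source or this diff): a result
# returned by check_uri can be told apart by its class, similar to what
# #report_results does above:
#
#   case result
#   when LinkChecker::Redirect
#     puts "redirected to #{result.final_destination_uri_string}"
#   when LinkChecker::Error
#     puts "error: #{result.error}"
#   when LinkChecker::Good
#     puts "good"
#   end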
private
+ # Check the current :max_threads setting and block until the number of running
+ # threads is below that number.
def wait_to_spawn_thread
# Never spawn more than the specified maximum number of threads.
until Thread.list.select {|thread| thread.status == "run"}.count <
(1 + @options[:max_threads]) do
# Wait 5 milliseconds before trying again.
\ No newline at end of file