lib/link_checker.rb in link-checker-0.5.2 vs lib/link_checker.rb in link-checker-0.6.0
- old
+ new
@@ -7,13 +7,20 @@
require 'anemone'
class LinkChecker
def initialize(params)
- @options = params[:options] || {}
+ @options = params[:options] || { }
@target = params[:target] || './'
+
+ @html_files = []
+ @links = []
+ @errors = []
+ @warnings = []
@return_code = 0
+
+ @options[:max_threads] ||= 100 # Only happens in testing.
end
def html_file_paths
Find.find(@target).map {|path|
FileTest.file?(path) && (path =~ /\.html?$/) ? path : nil
@@ -41,11 +48,11 @@
return Good.new(:uri_string => uri.to_s)
end
when Net::HTTPRedirection then
return self.check_uri(URI(response['location']), true)
else
- return Error.new(:uri_string => uri.to_s, :response => response)
+ return Error.new(:uri_string => uri.to_s, :error => response)
end
end
end
end
@@ -57,45 +64,71 @@
check_uris_in_files
end
rescue => error
puts "Error: #{error.to_s}".red
end
+
+ # Report the final results.
+ unless @html_files.empty?
+ file_pluralized = (@html_files.size.eql? 1) ? 'file' : 'files'
+ link_pluralized = (@links.size.eql? 1) ? 'link' : 'links'
+ if @errors.empty?
+ puts ("Checked #{@links.size} #{link_pluralized} in #{@html_files.size} " +
+ "HTML #{file_pluralized} and found no errors.").green
+ else
+ error_pluralized = (@errors.size.eql? 1) ? 'error' : 'errors'
+ puts ("Checked #{@links.size} #{link_pluralized} in #{@html_files.size} " +
+ "HTML #{file_pluralized} and found #{@errors.size} #{error_pluralized}.").red
+ end
+ end
+
@return_code
end
def check_uris_by_crawling
threads = []
Anemone.crawl(@target) do |anemone|
anemone.storage = Anemone::Storage.PStore('link-checker-crawled-pages.pstore')
anemone.on_every_page do |crawled_page|
raise StandardError.new(crawled_page.error) if crawled_page.error
threads << start_link_check_thread(crawled_page.body, crawled_page.url.to_s)
+ @html_files << crawled_page
end
end
threads.each{|thread| thread.join }
end
def check_uris_in_files
threads = []
html_file_paths.each do |file|
+ wait_to_spawn_thread
threads << start_link_check_thread(open(file), file)
+ @html_files << file
end
threads.each{|thread| thread.join }
end
def start_link_check_thread(source, source_name)
Thread.new do
- results = self.class.external_link_uri_strings(source).map do |uri_string|
- begin
- uri = URI(uri_string)
- response = self.class.check_uri(uri)
- response.uri_string = uri_string
- response
- rescue => error
- Error.new(:error => error.to_s, :uri_string => uri_string)
+ threads = []
+ results = []
+ self.class.external_link_uri_strings(source).each do |uri_string|
+ Thread.exclusive { @links << source }
+ wait_to_spawn_thread
+ threads << Thread.new do
+ begin
+ uri = URI(uri_string)
+ response = self.class.check_uri(uri)
+ response.uri_string = uri_string
+ Thread.exclusive { results << response }
+ rescue => error
+ Thread.exclusive { results <<
+ Error.new(:error => error.to_s, :uri_string => uri_string) }
+ end
end
end
+ threads.each {|thread| thread.join }
report_results(source_name, results)
end
end
def report_results(file, results)
@@ -106,10 +139,15 @@
@return_code = 1 unless warnings.empty?
errors = errors + warnings
warnings = []
end
Thread.exclusive do
+ # Store the results in the LinkChecker instance.
+ # This must be thread-exclusive to avoid a race condition.
+ @errors = @errors.concat(errors)
+ @warnings = @warnings.concat(warnings)
+
if errors.empty?
message = "Checked: #{file}"
if warnings.empty? || @options[:no_warnings]
puts message.green
else
@@ -159,9 +197,20 @@
class Error < Result
attr_reader :error
def initialize(params)
@error = params[:error]
super(params)
+ end
+ end
+
+ private
+
+ def wait_to_spawn_thread
+ # Never spawn more than the specified maximum number of threads.
+ until Thread.list.select {|thread| thread.status == "run"}.count <
+ (1 + @options[:max_threads]) do
+ # Wait 5 milliseconds before trying again.
+ sleep 0.005
end
end
end
\ No newline at end of file