lib/link_checker.rb in link-checker-0.2.0 vs lib/link_checker.rb in link-checker-0.3.0
- old
+ new
@@ -2,102 +2,143 @@
require 'nokogiri'
require 'net/http'
require 'net/https'
require 'uri'
require 'colorize'
+require 'anemone'
class LinkChecker
- def initialize(target_path)
- @target_path = target_path
+ def initialize(target)
+ @target = target
end
- def find_html_files
- Find.find(@target_path).map {|path|
+ def html_file_paths
+ Find.find(@target).map {|path|
FileTest.file?(path) && (path =~ /\.html?$/) ? path : nil
}.reject{|path| path.nil?}
end
- def self.find_external_links(file_path)
- Nokogiri::HTML(open(file_path)).css('a').
- select do |link|
+ def self.external_link_uri_strings(source)
+ Nokogiri::HTML(source).css('a').select {|link|
!link.attribute('href').nil? &&
link.attribute('href').value =~ /^https?\:\/\//
- end
+ }.map{|link| link.attributes['href'].value}
end
- def self.check_link(uri, redirected=false)
- uri = URI.parse(uri)
+ def self.check_uri(uri, redirected=false)
http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = true if uri.scheme == "https"
http.start do
path = (uri.path.empty?) ? '/' : uri.path
http.request_get(path) do |response|
case response
when Net::HTTPSuccess then
if redirected
- return Redirect.new(uri)
+ return Redirect.new(:final_destination_uri_string => uri.to_s)
else
- return Good.new
+ return Good.new(:uri_string => uri.to_s)
end
when Net::HTTPRedirection then
- return self.check_link(response['location'], true)
+ return self.check_uri(URI(response['location']), true)
else
- raise Error.new(response)
+ return Error.new(:uri_string => uri.to_s, :response => response)
end
end
end
end
- def check_links
- find_html_files.each do |file|
- bad_checks = []
- warnings = []
- self.class.find_external_links(file).each do |link|
- uri = link.attribute('href').value
+ def check_uris
+ if @target =~ /^https?\:\/\//
+ check_uris_by_crawling
+ else
+ check_uris_in_files
+ end
+ end
+
+ def check_uris_by_crawling
+ threads = []
+ Anemone.crawl(@target) do |anemone|
+ anemone.storage = Anemone::Storage.PStore('link-checker-crawled-pages.pstore')
+ anemone.on_every_page do |crawled_page|
+ threads << start_link_check_thread(crawled_page.body, crawled_page.url.to_s)
+ end
+ end
+ threads.each{|thread| thread.join }
+ end
+
+ def check_uris_in_files
+ threads = []
+ html_file_paths.each do |file|
+ threads << start_link_check_thread(open(file), file)
+ end
+ threads.each{|thread| thread.join }
+ end
+
+ def start_link_check_thread(source, source_name)
+ Thread.new do
+ results = self.class.external_link_uri_strings(source).map do |uri_string|
begin
- response = self.class.check_link(uri)
- if response.class.eql? Redirect
- warnings << { :link => link, :response => response }
- end
+ uri = URI(uri_string)
+ response = self.class.check_uri(uri)
+ { :uri_string => uri_string, :response => response }
rescue => error
- bad_checks << { :link => link, :response => error }
+ { :uri_string => uri_string, :response => Error.new(:error => error.to_s) }
end
end
+ report_results(source_name, results)
+ end
+ end
+ def report_results(file, results)
+ bad_checks = results.select{|result| result[:response].class.eql? Error}
+ warnings = results.select{|result| result[:response].class.eql? Redirect}
+ Thread.exclusive do
if bad_checks.empty?
+ message = "Checked: #{file}"
if warnings.empty?
- puts "Checked: #{file}".green
+ puts message.green
else
- puts "Checked: #{file}".yellow
+ puts message.yellow
end
warnings.each do |warning|
- puts " Warning: #{warning[:link].attribute('href').value}".yellow
- puts " Redirected to: #{warning[:response].final_destination.to_s}".yellow
+ puts " Warning: #{warning[:uri_string]}".yellow
+ puts " Redirected to: #{warning[:response].final_destination_uri_string}".yellow
end
else
puts "Problem: #{file}".red
bad_checks.each do |check|
- puts " Link: #{check[:link].attribute('href').value}".red
- puts " Response: #{check[:response].response.inspect}".red
+ puts " Link: #{check[:uri_string]}".red
+ puts " Response: #{check[:response].error.to_s}".red
end
end
end
end
- class Good; end
+ class Result
+ attr_reader :uri_string
+ def initialize(params)
+ @uri_string = params[:uri_string]
+ end
+ end
- class Redirect
- attr_reader :final_destination
- def initialize(final_destination)
- @final_destination = final_destination
+ class Good < Result
+ end
+
+ class Redirect < Result
+ attr_reader :good
+ attr_reader :final_destination_uri_string
+ def initialize(params)
+ @final_destination_uri_string = params[:final_destination_uri_string]
+ @good = params[:good]
+ super(params)
end
end
- class Error < StandardError
- attr_accessor :response
- def initialize(response)
- @response = response
+ class Error < Result
+ attr_reader :error
+ def initialize(params)
+ @error = params[:error]
end
end
end
\ No newline at end of file