lib/link_checker.rb in link-checker-0.2.0 vs lib/link_checker.rb in link-checker-0.3.0

- old
+ new

@@ -2,102 +2,143 @@ require 'nokogiri' require 'net/http' require 'net/https' require 'uri' require 'colorize' +require 'anemone' class LinkChecker - def initialize(target_path) - @target_path = target_path + def initialize(target) + @target = target end - def find_html_files - Find.find(@target_path).map {|path| + def html_file_paths + Find.find(@target).map {|path| FileTest.file?(path) && (path =~ /\.html?$/) ? path : nil }.reject{|path| path.nil?} end - def self.find_external_links(file_path) - Nokogiri::HTML(open(file_path)).css('a'). - select do |link| + def self.external_link_uri_strings(source) + Nokogiri::HTML(source).css('a').select {|link| !link.attribute('href').nil? && link.attribute('href').value =~ /^https?\:\/\// - end + }.map{|link| link.attributes['href'].value} end - def self.check_link(uri, redirected=false) - uri = URI.parse(uri) + def self.check_uri(uri, redirected=false) http = Net::HTTP.new(uri.host, uri.port) http.use_ssl = true if uri.scheme == "https" http.start do path = (uri.path.empty?) ? '/' : uri.path http.request_get(path) do |response| case response when Net::HTTPSuccess then if redirected - return Redirect.new(uri) + return Redirect.new(:final_destination_uri_string => uri.to_s) else - return Good.new + return Good.new(:uri_string => uri.to_s) end when Net::HTTPRedirection then - return self.check_link(response['location'], true) + return self.check_uri(URI(response['location']), true) else - raise Error.new(response) + return Error.new(:uri_string => uri.to_s, :response => response) end end end end - def check_links - find_html_files.each do |file| - bad_checks = [] - warnings = [] - self.class.find_external_links(file).each do |link| - uri = link.attribute('href').value + def check_uris + if @target =~ /^https?\:\/\// + check_uris_by_crawling + else + check_uris_in_files + end + end + + def check_uris_by_crawling + threads = [] + Anemone.crawl(@target) do |anemone| + anemone.storage = Anemone::Storage.PStore('link-checker-crawled-pages.pstore') + anemone.on_every_page do |crawled_page| + threads << start_link_check_thread(crawled_page.body, crawled_page.url.to_s) + end + end + threads.each{|thread| thread.join } + end + + def check_uris_in_files + threads = [] + html_file_paths.each do |file| + threads << start_link_check_thread(open(file), file) + end + threads.each{|thread| thread.join } + end + + def start_link_check_thread(source, source_name) + Thread.new do + results = self.class.external_link_uri_strings(source).map do |uri_string| begin - response = self.class.check_link(uri) - if response.class.eql? Redirect - warnings << { :link => link, :response => response } - end + uri = URI(uri_string) + response = self.class.check_uri(uri) + { :uri_string => uri_string, :response => response } rescue => error - bad_checks << { :link => link, :response => error } + { :uri_string => uri_string, :response => Error.new(:error => error.to_s) } end end + report_results(source_name, results) + end + end + def report_results(file, results) + bad_checks = results.select{|result| result[:response].class.eql? Error} + warnings = results.select{|result| result[:response].class.eql? Redirect} + Thread.exclusive do if bad_checks.empty? + message = "Checked: #{file}" if warnings.empty? - puts "Checked: #{file}".green + puts message.green else - puts "Checked: #{file}".yellow + puts message.yellow end warnings.each do |warning| - puts " Warning: #{warning[:link].attribute('href').value}".yellow - puts " Redirected to: #{warning[:response].final_destination.to_s}".yellow + puts " Warning: #{warning[:uri_string]}".yellow + puts " Redirected to: #{warning[:response].final_destination_uri_string}".yellow end else puts "Problem: #{file}".red bad_checks.each do |check| - puts " Link: #{check[:link].attribute('href').value}".red - puts " Response: #{check[:response].response.inspect}".red + puts " Link: #{check[:uri_string]}".red + puts " Response: #{check[:response].error.to_s}".red end end end end - class Good; end + class Result + attr_reader :uri_string + def initialize(params) + @uri_string = params[:uri_string] + end + end - class Redirect - attr_reader :final_destination - def initialize(final_destination) - @final_destination = final_destination + class Good < Result + end + + class Redirect < Result + attr_reader :good + attr_reader :final_destination_uri_string + def initialize(params) + @final_destination_uri_string = params[:final_destination_uri_string] + @good = params[:good] + super(params) end end - class Error < StandardError - attr_accessor :response - def initialize(response) - @response = response + class Error < Result + attr_reader :error + def initialize(params) + @error = params[:error] end end end \ No newline at end of file