# encoding: utf-8
module Nanoc3::Extra::Validators
# A validator that verifies that all links (`<a href="…">…</a>`) point to a
# location that exists.
class Links
  # Maximum number of HTTP requests made for a single URL while following
  # redirects.
  MAX_REDIRECTS = 5

  # Number of seconds a single HTTP request may take before it is abandoned.
  REQUEST_TIMEOUT = 10

  # Number of worker threads used for checking external links.
  THREAD_COUNT = 10

  # @param [String] dir The directory that will be searched for HTML files
  #   to validate
  #
  # @param [Array] index_filenames An array of index filenames that
  #   will be appended to URLs by web servers if a directory is requested
  #   instead of a file
  #
  # @option params [Boolean] :internal (false) True if internal links should
  #   be checked; false if they should not
  #
  # @option params [Boolean] :external (false) True if external links should
  #   be checked; false if they should not
  def initialize(dir, index_filenames, params={})
    @dir              = dir
    @index_filenames  = index_filenames
    @include_internal = !!params[:internal]
    @include_external = !!params[:external]

    # Guards both the shared result hash and console output while external
    # links are checked from several threads.
    @mutex = Mutex.new
  end

  # Starts the validator. The results will be printed to stdout.
  #
  # @return [void]
  def run
    require 'nokogiri'

    # The validator is its own delegate, so the outcome of every external
    # link check is printed as soon as it is known.
    @delegate = self

    links = all_broken_hrefs
    if links.empty?
      puts "No broken links found!"
      return
    end

    links.each_pair do |href, origins|
      puts "Broken link: #{href} -- referenced from:"
      origins.each { |origin| puts " #{origin}" }
      puts
    end
  end

  private

  # Enumerates all key-value pairs of a given hash in a thread-safe way.
  #
  # This class is a helper class, which means that it is not used directly
  # by nanoc. Future versions of nanoc may no longer contain this class. Do
  # not depend on this class to be available.
  class ThreadsafeHashEnumerator
    # Creates a new enumerator for the given hash.
    #
    # @param [Hash] hash The hash for which the enumerator should return
    #   key-value pairs
    def initialize(hash)
      @hash             = hash
      @unprocessed_keys = @hash.keys # Hash#keys already returns a new array
      @mutex            = Mutex.new
    end

    # Returns the next key-value pair in the hash.
    #
    # @return [Array, nil] An array containing the key and the corresponding
    #   value of the next key-value pair, or nil once every pair has been
    #   handed out
    def next_pair
      @mutex.synchronize do
        # Test for exhaustion explicitly so that a nil or false key does not
        # look like the end of the enumeration.
        if @unprocessed_keys.empty?
          nil
        else
          key = @unprocessed_keys.shift
          [ key, @hash[key] ]
        end
      end
    end
  end

  # Collects every href found in the HTML files, splits them into internal
  # and external ones, and validates whichever kinds were enabled through
  # the constructor options.
  #
  # @return [Hash<String, Array<String>>] Broken hrefs, each mapped to the
  #   filenames that reference them
  def all_broken_hrefs
    internal_hrefs = {}
    external_hrefs = {}

    # Split into internal and external hrefs
    all_hrefs_per_filename.each_pair do |filename, hrefs|
      # uniq: a file linking to the same href twice is still a single origin
      hrefs.uniq.each do |href|
        bucket = is_external_href?(href) ? external_hrefs : internal_hrefs
        (bucket[href] ||= []) << filename
      end
    end

    # Validate hrefs
    broken_hrefs = {}
    validate_internal_hrefs(internal_hrefs, broken_hrefs) if @include_internal
    validate_external_hrefs(external_hrefs, broken_hrefs) if @include_external
    broken_hrefs
  end

  # @return [Array<String>] Paths of all `.html` files below the directory
  #   given to the constructor
  def all_files
    Dir[@dir + '/**/*.html']
  end

  # @return [Hash<String, Array<String>>] For every HTML file, the hrefs
  #   that occur in it
  def all_hrefs_per_filename
    hrefs = {}
    all_files.each do |filename|
      hrefs[filename] ||= all_hrefs_in_file(filename)
    end
    hrefs
  end

  # @param [String] filename Path of the HTML file to parse
  #
  # @return [Array<String>] The `href` attribute of every `a` element in the
  #   file; elements without an `href` are skipped
  def all_hrefs_in_file(filename)
    # Loaded here as well so that this method does not depend on #run having
    # been called first.
    require 'nokogiri'

    doc = Nokogiri::HTML(::File.read(filename))
    doc.css('a').map { |l| l[:href] }.compact
  end

  # @param [String] href The href to classify
  #
  # @return [Boolean] true if the href starts with a URI scheme (matched
  #   case-insensitively, e.g. `http:`, `HTTPS:`, `mailto:`) or is
  #   protocol-relative (`//host/...`); false otherwise
  def is_external_href?(href)
    # \A instead of ^ so only the very start of the string is considered.
    !!(href =~ %r{\A(?:[a-z][a-z0-9+.\-]*:|//)}i)
  end

  # Checks whether an internal href resolves to an existing file, or to a
  # directory containing one of the index filenames.
  #
  # @param [String] href The href to check
  #
  # @param [String] origin Path of the file that contains the href; relative
  #   hrefs are resolved against its directory
  #
  # @return [Boolean] true if the target exists, false otherwise
  def is_valid_internal_href?(href, origin)
    # Skip hrefs that point to self
    # FIXME this is ugly and will not always be correct
    return true if href == '.'

    # Remove query string and fragment; neither is part of the file path
    path = href.sub(/[?#].*/m, '')
    return true if path.empty?

    # Make absolute
    path =
      if path[0, 1] == '/'
        @dir + path
      else
        ::File.expand_path(path, ::File.dirname(origin))
      end

    # Check whether file exists
    return true if ::File.file?(path)

    # Check whether directory with index file exists
    ::File.directory?(path) &&
      @index_filenames.any? { |fn| ::File.file?(::File.join(path, fn)) }
  end

  # Checks whether an external href can be fetched over HTTP(S). The
  # delegate, if any, is notified of the outcome for HTTP(S) hrefs and for
  # hrefs that cannot be parsed.
  #
  # @param [String] href The href to check
  #
  # @return [Boolean] false if the href is not a parseable URI or the server
  #   does not answer with a 2xx status (after following redirects); true
  #   otherwise. Hrefs that are not HTTP or HTTPS (including
  #   protocol-relative ones) are not fetched and count as valid.
  def is_valid_external_href?(href)
    require 'net/http'
    require 'uri'

    # Parse
    begin
      uri = URI.parse(href)
    rescue URI::InvalidURIError
      @delegate && @delegate.send(:external_href_validated, href, false)
      return false
    end

    # Skip non-HTTP URLs (to_s: the scheme is nil for `//host/path`)
    return true unless uri.scheme.to_s =~ /\Ahttps?\z/i

    # Get status
    status   = fetch_http_status_for(uri)
    is_valid = !!(status && status >= 200 && status <= 299)

    # Notify
    @delegate && @delegate.send(:external_href_validated, href, is_valid)

    is_valid
  end

  # Records every internal href that is broken, together with the files in
  # which it is broken. Because relative hrefs are resolved against the
  # referencing file, the same href can be valid in one file and broken in
  # another; only the latter files are recorded.
  #
  # @param [Hash<String, Array<String>>] hrefs Internal hrefs mapped to the
  #   filenames referencing them
  #
  # @param [Hash] broken_hrefs Hash to which broken hrefs are added
  #
  # @return [void]
  def validate_internal_hrefs(hrefs, broken_hrefs)
    hrefs.each_pair do |href, filenames|
      broken_origins = filenames.reject do |filename|
        is_valid_internal_href?(href, filename)
      end
      broken_hrefs[href] = broken_origins unless broken_origins.empty?
    end
  end

  # Checks all external hrefs using a fixed number of worker threads and
  # records the broken ones.
  #
  # @param [Hash<String, Array<String>>] hrefs External hrefs mapped to the
  #   filenames referencing them
  #
  # @param [Hash] broken_hrefs Hash to which broken hrefs are added
  #
  # @return [void]
  def validate_external_hrefs(hrefs, broken_hrefs)
    @mutex ||= Mutex.new
    enum = ThreadsafeHashEnumerator.new(hrefs)

    threads = Array.new(THREAD_COUNT) do
      Thread.new do
        # Each worker keeps pulling pairs until the enumerator is exhausted
        while (pair = enum.next_pair)
          href, filenames = pair
          next if is_valid_external_href?(href)

          @mutex.synchronize { broken_hrefs[href] = filenames }
        end
      end
    end

    threads.each { |t| t.join }
  end

  # Requests the given URL, following redirects, and returns the final HTTP
  # status code.
  #
  # @param [URI] url The URL to request
  #
  # @param [Hash] params Unused; kept so existing callers keep working
  #
  # @return [Integer, nil] The status code of the first non-redirect
  #   response, or nil if a request failed or timed out, a redirect had no
  #   usable `Location` header, or the redirect limit was reached
  def fetch_http_status_for(url, params={})
    require 'timeout'
    require 'uri'

    MAX_REDIRECTS.times do
      res = Timeout.timeout(REQUEST_TIMEOUT) { request_url_once(url) }
      return res.code.to_i unless res.code =~ /\A3\d\d\z/

      location = res['location']
      return nil if location.nil?

      # Location may be relative, so resolve it against the current URL
      url = url.merge(location)
    end

    # Still being redirected after MAX_REDIRECTS requests: give up
    nil
  rescue StandardError
    # Any failure (network error, timeout, malformed redirect target) means
    # the status is unknown
    nil
  end

  # Performs a single HEAD request for the given URL, without following
  # redirects.
  #
  # @param [URI] url The URL to request
  #
  # @return [Net::HTTPResponse] The response to the HEAD request
  def request_url_once(url)
    require 'net/http'

    # request_uri includes the query string, which url.path would drop
    target = url.respond_to?(:request_uri) ? url.request_uri : url.path
    target = '/' if target.nil? || target.empty?

    req = Net::HTTP::Head.new(target)
    Net::HTTP.start(url.host, url.port, :use_ssl => (url.scheme == 'https')) do |http|
      http.request(req)
    end
  end

  # Prints the outcome of an external link check, coloured with ANSI escape
  # sequences. Printing is serialised through the mutex because this method
  # is called from the worker threads.
  #
  # @param [String] href The href that was checked
  #
  # @param [Boolean] is_valid Whether the href was found to be valid
  #
  # @return [void]
  def external_href_validated(href, is_valid)
    text  = is_valid ? 'ok' : ' ERROR '
    color = is_valid ? "\e[32m" : "\e[41m\e[37m"
    reset = "\033[0m"

    @mutex.synchronize do
      puts href + ': ' + color + text + reset
    end
  end
end
end