lib/cohesion.rb in cohesion-1.0.0 vs lib/cohesion.rb in cohesion-1.0.1
- old
+ new
@@ -1,8 +1,10 @@
+require 'bundler/setup'
require "cohesion/version"
require 'cobweb'
require 'ptools'
+require 'digest/md5'
require 'cohesion/railtie' if defined?(Rails)
module Cohesion
class Check
@@ -67,21 +69,33 @@
def self.site(url, options={})
errors = []
failures = []
+ pages = {}
+
options[:cache] = options[:cache].to_i if options[:cache]
crawler_options = {:cache_type => :full, :crawl_linked_external => true, :store_inbound_links => true}.merge(options)
- puts crawler_options
statistics = CobwebCrawler.new(crawler_options).crawl(url) do |page|
print page[:url]
+
+ # Fingerprint the body once and group URLs by digest; a digest that has
+ # already been recorded means this page repeats earlier content.
+ digest = Digest::MD5.hexdigest(page[:body])
+ duplicate = pages.key?(digest)
+ (pages[digest] ||= []) << page[:url]
+
+ # If the page came back as a 404, fetch it again with the cache disabled before reporting it
if page[:status_code] == 404
page = Cobweb.new(crawler_options.merge(:cache => nil)).get(page[:url])
end
- if page[:status_code] > 399
- puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
+
+ if page[:status_code] == 404 || duplicate
+ if duplicate
+ puts " [duplicate] \e[31m\u2717\e[0m"
+ else
+ puts " [#{page[:status_code]}] \e[31m\u2717\e[0m"
+ end
failures << page
else
puts " \e[32m\u2713\e[0m"
end
end
@@ -110,10 +124,20 @@
puts " - #{inbound_link}"
end
end
puts ""
+ puts "Duplicate Content"
+ puts ""
+ pages.select{|k,v| v.count > 1}.each do |k,v|
+ puts "Duplicate: #{k}"
+ v.map{|x| puts " - #{x}" }
+ end
+
+
+ puts ""
puts "Total Failed URLs: #{total_failures}"
+ # Sum the URL counts of every digest seen more than once. The 0 seed keeps the
+ # fold numeric, so it works for zero, one or many duplicate groups.
+ puts "Total Duplicates: #{pages.values.select { |urls| urls.count > 1 }.inject(0) { |total, urls| total + urls.count }}"
puts "Total Inbound Failures (Pages linking to a 404): #{total_inbound_failures}"
puts ""
end
puts