require 'enumerator'
require 'net/http'
require 'uri'
# Raised when the SEO check cannot proceed (for example, when the site has no
# sitemap.xml).
#
# Inherits from StandardError rather than Exception so that a plain `rescue`
# (which only handles StandardError and its descendants) can catch it.
class SEOException < StandardError
end
# Fetches a site's /sitemap.xml, requests every page listed in it and collects
# SEO problems: unreachable pages, missing or duplicated titles and meta
# descriptions, and URLs that use bare IDs, too many keywords or deep nesting.
class SEOChecker
  # Matches one <loc> element of a sitemap; group 1 is the page URL.
  SITEMAP_LOCATION_PATTERN = %r{<loc>\s*(.*?)\s*</loc>}im.freeze

  # Matches the page's <title> element; group 1 is the title text.
  TITLE_PATTERN = %r{<title[^>]*>(.*?)</title>}im.freeze

  # Match a description <meta> tag with the attributes in either order.
  # Group 1 is the quote character, group 2 is the description text. The text
  # may not contain '<' or '>' so a match never spans more than one tag.
  DESCRIPTION_PATTERNS = [
    %r{<meta\s+name=["']description["']\s+content=(["'])([^<>]*?)\1}i,
    %r{<meta\s+content=(["'])([^<>]*?)\1\s+name=["']description["']}i
  ].freeze

  # url     - any URL on the site; its path is replaced by /sitemap.xml.
  # options - Hash of optional settings:
  #           :batch_size    - number of pages requested per batch; when
  #                            missing or not positive, all pages form a
  #                            single batch.
  #           :interval_time - seconds to pause between batches (default 0).
  def initialize(url, options={})
    @url = url
    @locations = []
    @titles = {}
    @descriptions = {}
    @errors = []
    # nil.to_i is 0, so both values are always Integers; 0 means "not set".
    @batch_size = options[:batch_size].to_i
    @interval_time = options[:interval_time].to_i
  end

  # Runs the whole check and prints the collected errors. When an
  # SEOException is raised along the way, only its message is printed.
  def check
    check_sitemap
    check_location
    report
  rescue SEOException => e
    puts e.message
  end

  # Downloads /sitemap.xml and stores the page URLs found in it.
  #
  # Raises SEOException when the sitemap request is not successful.
  def check_sitemap
    # TODO: allow a manually specified sitemap file
    uri = URI.parse(@url)
    uri.path = '/sitemap.xml'
    response = get_response(uri)
    unless response.is_a?(Net::HTTPSuccess)
      raise SEOException, "Error: There is no sitemap.xml."
    end

    @locations = response.body.to_s.scan(SITEMAP_LOCATION_PATTERN).flatten.reject { |location| location.empty? }
  end

  # Requests every known location in batches and records the problems found
  # on each page, pausing @interval_time seconds between batches.
  def check_location
    # each_slice rejects a slice size of 0, so bail out on an empty sitemap.
    return if @locations.empty?

    batch_size = @batch_size > 0 ? @batch_size : @locations.size
    last_batch = (@locations.size - 1) / batch_size
    @locations.each_slice(batch_size).each_with_index do |batch_locations, index|
      batch_locations.each { |location| check_page(location) }
      # No pause after the final batch: nothing is left to throttle.
      sleep(@interval_time) if @interval_time > 0 && index < last_batch
    end
  end

  # Adds an error for every title and description shared by more than one
  # page, then prints all collected errors, one per line.
  def report
    report_duplicates(@titles, 'title')
    report_duplicates(@descriptions, 'description')
    puts @errors.join("\n")
  end

  private

  # Performs a GET request for uri and returns the Net::HTTPResponse.
  def get_response(uri)
    http = Net::HTTP.new(uri.host, uri.port)
    # Without TLS enabled, a request to an https URL cannot succeed.
    http.use_ssl = true if uri.scheme == 'https'
    request = Net::HTTP::Get.new(uri.request_uri)
    request["User-Agent"] = "seo-checker"
    http.request(request)
  end

  # Returns the response for location, or nil when the URL is malformed or
  # the request fails, so one bad entry does not abort the whole run.
  def fetch_page(location)
    get_response(URI.parse(location))
  rescue StandardError
    nil
  end

  # Runs all per-page checks for location, or records it as unreachable.
  def check_page(location)
    response = fetch_page(location)
    if response.is_a?(Net::HTTPSuccess)
      check_title(response, location)
      check_description(response, location)
      check_url(location)
    else
      @errors << "The page is unreachable #{location}."
    end
  end

  # Records the page title for duplicate detection, or an error when the page
  # has no (non-blank) title.
  def check_title(response, location)
    title = response.body.to_s[TITLE_PATTERN, 1].to_s.strip
    if title.empty?
      @errors << "#{location} has no title."
    else
      (@titles[title] ||= []) << location
    end
  end

  # Records the meta description for duplicate detection, or an error when
  # the page has no (non-blank) description.
  def check_description(response, location)
    body = response.body.to_s
    description = DESCRIPTION_PATTERNS.map { |pattern| body[pattern, 2] }.compact.first.to_s.strip
    if description.empty?
      @errors << "#{location} has no description."
    else
      (@descriptions[description] ||= []) << location
    end
  end

  # Records errors for URLs that contain a purely numeric segment, a segment
  # with more than five hyphen-separated words, or more than eight
  # '/'-separated parts (the scheme and host count as parts too).
  def check_url(location)
    items = location.split('/')
    if items.any? { |item| item =~ /^\d+$/ } || items.last.to_s =~ /^\d+\.htm(l)?/
      @errors << "#{location} should not just use ID number in URL."
    end
    if items.any? { |item| item.split('-').size > 5 }
      @errors << "#{location} use excessive keywords"
    end
    if items.size > 8
      @errors << "#{location} has deep nesting of subdirectories"
    end
  end

  # Adds one error per value in pages_by_value that is shared by more than
  # one location. At most five locations are listed in each message.
  def report_duplicates(pages_by_value, kind)
    pages_by_value.each do |value, locations|
      next unless locations.size > 1

      @errors << "#{locations.slice(0, 5).join(', ')} #{'and ...' if locations.size > 5} have the same #{kind} '#{value}'."
    end
  end
end