lib/big_sitemap.rb in alexrabarts-big_sitemap-0.2.1 vs lib/big_sitemap.rb in alexrabarts-big_sitemap-0.3.0
- old
+ new
@@ -1,209 +1,248 @@
-require 'net/http'
require 'uri'
require 'zlib'
require 'builder'
require 'extlib'
class BigSitemap
+ # Option values used when the caller does not supply them; the hash passed
+ # to the constructor is merged over these.
+ DEFAULTS = {
+ :max_per_sitemap => 50000,
+ :batch_size => 1001,
+ :path => 'sitemaps',
+ :gzip => true,
+
+ # opinionated: only Google is pinged unless the caller opts in to the others
+ :ping_google => true,
+ :ping_yahoo => false, # needs :yahoo_app_id
+ :ping_msn => false,
+ :ping_ask => false
+ }
+
+ # Candidate method names, in order of preference; the first one the target
+ # responds to is used (see pick_method).
+ COUNT_METHODS = [:count_for_sitemap, :count]
+ FIND_METHODS = [:find_for_sitemap, :all]
+ TIMESTAMP_METHODS = [:updated_at, :updated_on, :updated, :created_at, :created_on, :created]
+ PARAM_METHODS = [:to_param, :id]
+
+ # Mixed in only when Rails is defined; generate calls polymorphic_url in that case.
+ include ActionController::UrlWriter if defined? Rails
+
+ # Builds a sitemap generator.
+ #
+ # options - Hash merged over DEFAULTS. Either :url_options (Hash) or
+ # :base_url (String) must be given. :document_root must be given
+ # unless Rails or Merb is defined.
+ #
+ # Raises ArgumentError when no URL information is supplied, when
+ # :batch_size is greater than :max_per_sitemap, or when no document root
+ # can be determined.
def initialize(options)
- document_root = options.delete(:document_root)
+ @options = DEFAULTS.merge options
- if document_root.nil?
- if defined? RAILS_ROOT
- document_root = "#{RAILS_ROOT}/public"
+ # Use Rails' default_url_options if available
+ # NOTE(review): under Rails this hash is updated in place below; presumably
+ # polymorphic_url relies on seeing the same host -- confirm before copying it.
+ @default_url_options = defined?(Rails) ? default_url_options : {}
+
+ if @options[:url_options]
+ @default_url_options.update @options[:url_options]
+ elsif @options[:base_url]
+ uri = URI.parse(@options[:base_url])
+ @default_url_options[:host] = uri.host
+ @default_url_options[:port] = uri.port
+ @default_url_options[:protocol] = uri.scheme
+ else
+ raise ArgumentError, 'you must specify either ":url_options" hash or ":base_url" string'
+ end
+
+ # Equal sizes are allowed, so the message says "less than or equal to"
+ if @options[:batch_size] > @options[:max_per_sitemap]
+ raise ArgumentError, '":batch_size" must be less than or equal to ":max_per_sitemap"'
+ end
+
+ @options[:document_root] ||= begin
+ if defined? Rails
+ "#{Rails.root}/public"
elsif defined? Merb
- document_root = "#{Merb.root}/public"
+ "#{Merb.root}/public"
end
end
- raise ArgumentError, 'Document root must be specified with the :document_root option' if document_root.nil?
+ unless @options[:document_root]
+ raise ArgumentError, 'Document root must be specified with the ":document_root" option'
+ end
- @base_url = options.delete(:base_url)
- @max_per_sitemap = options.delete(:max_per_sitemap) || 50000
- @batch_size = options.delete(:batch_size) || 1001 # TODO: Set this to 1000 once DM offset 37000 bug is fixed
- @web_path = strip_leading_slash(options.delete(:path) || 'sitemaps')
- @ping_google = options[:ping_google].nil? ? true : options.delete(:ping_google)
- @ping_yahoo = options[:ping_yahoo].nil? ? true : options.delete(:ping_yahoo)
- @yahoo_app_id = options.delete(:yahoo_app_id)
- @ping_msn = options[:ping_msn].nil? ? true : options.delete(:ping_msn)
- @ping_ask = options[:ping_ask].nil? ? true : options.delete(:ping_ask)
- @file_path = "#{document_root}/#{@web_path}"
- @sources = []
-
- raise ArgumentError, "Base URL must be specified with the :base_url option" if @base_url.nil?
-
- raise(
- ArgumentError,
- 'Batch size (:batch_size) must be less than or equal to maximum URLs per sitemap (:max_per_sitemap)'
- ) if @batch_size > @max_per_sitemap
-
+ # Directory on disk that the sitemap files are written into
+ @file_path = "#{@options[:document_root]}/#{strip_leading_slash(@options[:path])}"
- Dir.mkdir(@file_path) unless File.exists? @file_path
+ # File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling
+ Dir.mkdir(@file_path) unless File.exist? @file_path
+
+ @sources = []
+ @sitemap_files = []
end
- def add(options)
- raise ArgumentError, ':model and :path options must be provided' unless options[:model] && options[:path]
- @sources << options.update(:path => strip_leading_slash(options[:path]))
- self # Chainable
+ # Registers a model to be included in the sitemap.
+ #
+ # model - the class whose records are listed.
+ # options - Hash of per-source settings; :path defaults to the tableized
+ # model name. generate also reads :change_frequency and :priority.
+ #
+ # Returns self so calls can be chained.
+ def add(model, options={})
+ # Copy first so the default :path is never written into the caller's hash
+ options = options.dup
+ options[:path] ||= Extlib::Inflection.tableize(model.to_s)
+ @sources << [model, options]
+ return self
end
+ # Deletes previously generated sitemap files (sitemap_*.xml and
+ # sitemap_*.xml.gz) from the output directory; other files are left alone.
+ #
+ # Returns self so calls can be chained.
def clean
- unless @file_path.nil?
- Dir.foreach(@file_path) do |f|
- f = "#{@file_path}/#{f}"
- File.delete(f) if File.file?(f)
- end
+ Dir["#{@file_path}/sitemap_*.{xml,xml.gz}"].each do |file|
+ # File.delete is core Ruby; this file never requires 'fileutils'
+ File.delete file
end
- self # Chainable
+ return self
end
+ # Writes the sitemap file(s) for every registered source, then writes the
+ # sitemap index.
+ #
+ # Search engines are not pinged from here; ping_search_engines has to be
+ # called separately.
+ #
+ # Raises ArgumentError when a model responds to none of COUNT_METHODS or
+ # none of FIND_METHODS.
+ # Returns self so calls can be chained.
def generate
- @sources.each do |source|
- klass = source[:model]
+ for model, options in @sources
+ # First matching candidate wins: the *_for_sitemap variants are preferred
+ count_method = pick_method(model, COUNT_METHODS)
+ find_method = pick_method(model, FIND_METHODS)
+ raise ArgumentError, "#{model} must provide a count_for_sitemap class method" if count_method.nil?
+ raise ArgumentError, "#{model} must provide a find_for_sitemap class method" if find_method.nil?
- count_method = pick_method(klass, [:count_for_sitemap, :count])
- find_method = pick_method(klass, [:find_for_sitemap, :all])
- raise ArgumentError, "#{klass} must provide a count_for_sitemap class method" if count_method.nil?
- raise ArgumentError, "#{klass} must provide a find_for_sitemap class method" if find_method.nil?
-
- count = klass.send(count_method)
+ count = model.send(count_method)
num_sitemaps = 1
num_batches = 1
- if count > @batch_size
- num_batches = (count.to_f / @batch_size.to_f).ceil
- num_sitemaps = (count.to_f / @max_per_sitemap.to_f).ceil
+ # Only split into several batches/files once one batch cannot hold everything
+ if count > @options[:batch_size]
+ num_batches = (count.to_f / @options[:batch_size].to_f).ceil
+ num_sitemaps = (count.to_f / @options[:max_per_sitemap].to_f).ceil
end
batches_per_sitemap = num_batches.to_f / num_sitemaps.to_f
- # Update the @sources hash so that the index file knows how many sitemaps to link to
- source[:num_sitemaps] = num_sitemaps
+ # Starts as a copy of the source options, so :path (and :change_frequency /
+ # :priority when given) are handed to the finder as well.
+ # NOTE(review): confirm the finder tolerates those extra keys.
+ find_options = options.dup
for sitemap_num in 1..num_sitemaps
# Work out the start and end batch numbers for this sitemap
batch_num_start = sitemap_num == 1 ? 1 : ((sitemap_num * batches_per_sitemap).ceil - batches_per_sitemap + 1).to_i
batch_num_end = (batch_num_start + [batches_per_sitemap, num_batches].min).floor - 1
# Stream XML output to a file
- filename = "sitemap_#{Extlib::Inflection::underscore(klass.to_s)}"
+ filename = "sitemap_#{Extlib::Inflection::tableize(model.to_s)}"
filename << "_#{sitemap_num}" if num_sitemaps > 1
- gz = gz_writer("#{filename}.xml.gz")
+ # xml_open adds the extension(s) and records the file in @sitemap_files
+ f = xml_open(filename)
- xml = Builder::XmlMarkup.new(:target => gz)
+ xml = Builder::XmlMarkup.new(:target => f)
xml.instruct!
xml.urlset(:xmlns => 'http://www.sitemaps.org/schemas/sitemap/0.9') do
for batch_num in batch_num_start..batch_num_end
- offset = ((batch_num - 1) * @batch_size)
- limit = (count - offset) < @batch_size ? (count - offset - 1) : @batch_size
- find_options = num_batches > 1 ? {:limit => limit, :offset => offset} : {}
+ offset = ((batch_num - 1) * @options[:batch_size])
+ # NOTE(review): for a short final batch this asks for (count - offset - 1)
+ # rows, one fewer than remain after offset -- looks like the last record
+ # is skipped; confirm against the ORM in use.
+ limit = (count - offset) < @options[:batch_size] ? (count - offset - 1) : @options[:batch_size]
+ find_options.update(:limit => limit, :offset => offset) if num_batches > 1
- klass.send(find_method, find_options).each do |r|
- last_mod_method = pick_method(
- r,
- [:updated_at, :updated_on, :updated, :created_at, :created_on, :created]
- )
+ model.send(find_method, find_options).each do |r|
+ last_mod_method = pick_method(r, TIMESTAMP_METHODS)
last_mod = last_mod_method.nil? ? Time.now : r.send(last_mod_method)
- param_method = pick_method(r, [:to_param, :id])
- raise ArgumentError, "#{klass} must provide a to_param instance method" if param_method.nil?
+ # NOTE(review): param_method may be nil here; it is only used on the
+ # non-Rails branch below.
+ param_method = pick_method(r, PARAM_METHODS)
xml.url do
- xml.loc("#{@base_url}/#{source[:path]}/#{r.send(param_method)}")
+ # Under Rails the record's URL comes from polymorphic_url; otherwise it is
+ # assembled from root_url, the source :path and the record's param.
+ location = defined?(Rails) ?
+ polymorphic_url(r) :
+ "#{root_url}/#{strip_leading_slash(options[:path])}/#{r.send(param_method)}"
+ xml.loc(location)
+
xml.lastmod(last_mod.strftime('%Y-%m-%d')) unless last_mod.nil?
- xml.changefreq('weekly')
+
+ # :change_frequency and :priority may be plain values or Procs called with the record
+ change_frequency = options[:change_frequency] || 'weekly'
+ xml.changefreq(change_frequency.is_a?(Proc) ? change_frequency.call(r) : change_frequency)
+
+ priority = options[:priority]
+ unless priority.nil?
+ xml.priority(priority.is_a?(Proc) ? priority.call(r) : priority)
+ end
end
end
end
end
- gz.close
+ f.close
end
end
generate_sitemap_index
- ping_search_engines
- self # Chainable
+
+ return self
end
- private
- def strip_leading_slash(str)
- str.sub(/^\//, '')
+ # Notifies each enabled search engine (:ping_google, :ping_yahoo, :ping_msn,
+ # :ping_ask) of the most recently written sitemap file, which is the index
+ # when generate has just run.
+ #
+ # Yahoo is only pinged when :yahoo_app_id is set; otherwise a message is
+ # written to $stderr.
+ def ping_search_engines
+ require 'net/http'
+ require 'cgi'
+
+ sitemap_uri = CGI::escape(url_for_sitemap(@sitemap_files.last))
+
+ if @options[:ping_google]
+ Net::HTTP.get('www.google.com', "/webmasters/tools/ping?sitemap=#{sitemap_uri}")
end
- def pick_method(klass, candidates)
- method = nil
- candidates.each do |candidate|
- if klass.respond_to? candidate
- method = candidate
- break
- end
+ if @options[:ping_yahoo]
+ if @options[:yahoo_app_id]
+ Net::HTTP.get(
+ 'search.yahooapis.com', "/SiteExplorerService/V1/updateNotification?" +
+ "appid=#{@options[:yahoo_app_id]}&url=#{sitemap_uri}"
+ )
+ else
+ $stderr.puts 'unable to ping Yahoo: no ":yahoo_app_id" provided'
end
- method
end
- def gz_writer(filename)
- Zlib::GzipWriter.new(File.open("#{@file_path}/#{filename}", 'w+'))
+ if @options[:ping_msn]
+ Net::HTTP.get('webmaster.live.com', "/ping.aspx?siteMap=#{sitemap_uri}")
end
- def sitemap_index_filename
- 'sitemap_index.xml.gz'
+ # The option is :ping_ask (see DEFAULTS); the misspelt key was never set
+ if @options[:ping_ask]
+ Net::HTTP.get('submissions.ask.com', "/ping?sitemap=#{sitemap_uri}")
end
+ end
- # Create a sitemap index document
- def generate_sitemap_index
- xml = ''
- builder = Builder::XmlMarkup.new(:target => xml)
- builder.instruct!
- builder.sitemapindex(:xmlns => 'http://www.sitemaps.org/schemas/sitemap/0.9') do
- @sources.each do |source|
- num_sitemaps = source[:num_sitemaps]
- for i in 1..num_sitemaps
- loc = "#{@base_url}/#{@web_path}/sitemap_#{Extlib::Inflection::underscore(source[:model].to_s)}"
- loc << "_#{i}" if num_sitemaps > 1
- loc << '.xml.gz'
+ # Base URL of the site, built from the default URL options:
+ # protocol (defaults to 'http'), host, and the port unless it is unset or 80.
+ # The result is memoized.
+ def root_url
+ @root_url ||= begin
+ url = ''
+ url << (@default_url_options[:protocol] || 'http')
+ url << '://' unless url.match('://')
+ url << @default_url_options[:host]
+ # Assign before use: a modifier-if cannot introduce the local it tests
+ port = @default_url_options[:port]
+ url << ":#{port}" if port && port != 80
+ # Return the string explicitly; a skipped modifier-if evaluates to nil
+ url
+ end
+ end
- builder.sitemap do
- builder.loc(loc)
- builder.lastmod(Time.now.strftime('%Y-%m-%d'))
- end
- end
- end
- end
+ private
- gz = gz_writer(sitemap_index_filename)
- gz.write(xml)
- gz.close
- end
+ # Returns str without a single leading "/", if it has one.
+ def strip_leading_slash(str)
+ # \A anchors to the start of the string; ^ would also match after a newline
+ str.sub(%r{\A/}, '')
+ end
- def sitemap_uri
- URI.escape("#{@base_url}/#{@web_path}/#{sitemap_index_filename}")
+ # Returns the first name in candidates that model responds to, or nil when
+ # it responds to none of them.
+ def pick_method(model, candidates)
+ candidates.find do |candidate|
+ model.respond_to? candidate
end
+ end
- # Notify Google of the new sitemap index file
- def ping_google
- Net::HTTP.get('www.google.com', "/webmasters/tools/ping?sitemap=#{sitemap_uri}")
- end
+ # Opens a sitemap file for writing inside the output directory and records
+ # its path in @sitemap_files.
+ #
+ # filename - base name without extension; ".xml" is appended, plus ".gz"
+ # when the :gzip option is on. The argument is not modified.
+ #
+ # With a block, yields the writer and closes it afterwards, even when the
+ # block raises. Without a block the caller must close the returned writer.
+ #
+ # Returns the writer: a Zlib::GzipWriter when :gzip is on, else the File.
+ def xml_open(filename)
+ # Build a new string rather than appending to the caller's argument
+ filename = "#{filename}.xml"
+ filename << '.gz' if @options[:gzip]
- # Notify Yahoo! of the new sitemap index file
- def ping_yahoo
- Net::HTTP.get('search.yahooapis.com', "/SiteExplorerService/V1/updateNotification?appid=#{@yahoo_app_id}&url=#{sitemap_uri}")
- end
+ file = File.open("#{@file_path}/#{filename}", 'w+')
- # Notify MSN of the new sitemap index file
- def ping_msn
- Net::HTTP.get('webmaster.live.com', "/ping.aspx?siteMap=#{sitemap_uri}")
- end
+ @sitemap_files << file.path
- # Notify Ask of the new sitemap index file
- def ping_ask
- Net::HTTP.get('submissions.ask.com', "/ping?sitemap=#{sitemap_uri}")
+ writer = @options[:gzip] ? Zlib::GzipWriter.new(file) : file
+
+ if block_given?
+ begin
+ yield writer
+ ensure
+ writer.close
+ end
end
- def ping_search_engines
- ping_google if @ping_google
- ping_yahoo if @ping_yahoo && @yahoo_app_id
- ping_msn if @ping_msn
- ping_ask if @ping_ask
+ writer
+ end
+
+ # Public URL of a generated sitemap file.
+ #
+ # path - filesystem path of the file; only its base name is used.
+ #
+ # The files are written under the :path directory of the document root, so
+ # that directory is part of the URL as well.
+ def url_for_sitemap(path)
+ web_path = strip_leading_slash(@options[:path].to_s).chomp('/')
+ # Skip the directory segment when :path is empty so no "//" is produced
+ segments = [web_path, File.basename(path)].reject { |segment| segment.empty? }
+ "#{root_url}/#{segments.join('/')}"
+ end
+
+ # Writes the sitemap index: one <sitemap> entry, with today's date as
+ # lastmod, for each sitemap file recorded so far.
+ def generate_sitemap_index
+ xml_open 'sitemap_index' do |io|
+ index = Builder::XmlMarkup.new(:target => io)
+ index.instruct!
+ index.sitemapindex(:xmlns => 'http://www.sitemaps.org/schemas/sitemap/0.9') do
+ # xml_open has already recorded the index file itself as the last entry; leave it out
+ @sitemap_files[0..-2].each do |sitemap_path|
+ index.sitemap do
+ index.loc(url_for_sitemap(sitemap_path))
+ index.lastmod(Time.now.strftime('%Y-%m-%d'))
+ end
+ end
+ end
end
-end
\ No newline at end of file
+ end
+end