# lib/big_sitemap.rb
#
# BigSitemap as of big_sitemap 0.3.0 (the "+ new" side of the 0.2.1 -> 0.3.0
# comparison), with the defects called out in the comments below corrected.

require 'uri'
require 'zlib'
require 'fileutils'
require 'builder'
require 'extlib'

# Writes one or more XML sitemap files (optionally gzipped) plus a sitemap
# index for a set of model classes, and can notify search engines about the
# resulting index.
#
# Typical use:
#
#   BigSitemap.new(:base_url => 'http://example.com', :document_root => 'public').
#     add(Post, :change_frequency => 'daily').
#     generate.
#     ping_search_engines
class BigSitemap
  DEFAULTS = {
    :max_per_sitemap => 50000,
    :batch_size      => 1001,
    :path            => 'sitemaps',
    :gzip            => true,

    # opinionated
    :ping_google => true,
    :ping_yahoo  => false, # needs :yahoo_app_id
    :ping_msn    => false,
    :ping_ask    => false
  }.freeze

  COUNT_METHODS     = [:count_for_sitemap, :count].freeze
  FIND_METHODS      = [:find_for_sitemap, :all].freeze
  TIMESTAMP_METHODS = [:updated_at, :updated_on, :updated, :created_at, :created_on, :created].freeze
  PARAM_METHODS     = [:to_param, :id].freeze

  # Per-source options that only describe the sitemap entry; they must never
  # be forwarded to the model's finder.
  SITEMAP_OPTIONS = [:path, :change_frequency, :priority].freeze

  include ActionController::UrlWriter if defined? Rails

  # options - Hash merged over DEFAULTS. Besides the DEFAULTS keys it reads:
  #           :url_options   - Hash with :host / :port / :protocol, or
  #           :base_url      - String such as "http://example.com"
  #                            (one of the two is required);
  #           :document_root - directory served as the web root (defaults to
  #                            "<root>/public" under Rails or Merb);
  #           :yahoo_app_id  - needed when :ping_yahoo is true.
  #
  # Raises ArgumentError when no URL information or document root is
  # available, or when :batch_size exceeds :max_per_sitemap.
  def initialize(options)
    @options = DEFAULTS.merge(options)

    # Use Rails' default_url_options if available. Work on a copy: the hash is
    # updated below and the original may be shared with the rest of the app.
    @default_url_options = defined?(Rails) ? (default_url_options || {}).dup : {}

    if @options[:url_options]
      @default_url_options.update(@options[:url_options])
    elsif @options[:base_url]
      uri = URI.parse(@options[:base_url])
      @default_url_options[:host]     = uri.host
      @default_url_options[:port]     = uri.port
      @default_url_options[:protocol] = uri.scheme
    else
      raise ArgumentError, 'you must specify either ":url_options" hash or ":base_url" string'
    end

    # Equal sizes are accepted, so the message says "or equal to".
    if @options[:batch_size] > @options[:max_per_sitemap]
      raise ArgumentError, '":batch_size" must be less than or equal to ":max_per_sitemap"'
    end

    @options[:document_root] ||=
      if defined?(Rails)
        "#{Rails.root}/public"
      elsif defined?(Merb)
        "#{Merb.root}/public"
      end

    unless @options[:document_root]
      raise ArgumentError, 'Document root must be specified with the ":document_root" option'
    end

    @file_path = "#{@options[:document_root]}/#{strip_leading_slash(@options[:path])}"
    # File.exist? rather than File.exists?, which newer Rubies no longer provide.
    Dir.mkdir(@file_path) unless File.exist?(@file_path)

    @sources       = []
    @sitemap_files = []
  end

  # Registers a model class as a sitemap source.
  #
  # model   - class responding to one of COUNT_METHODS and one of FIND_METHODS.
  # options - Hash; :path (URL segment, defaults to the tableized model name),
  #           :change_frequency and :priority (each a value or a Proc called
  #           with the record). Any other keys are handed to the finder.
  #
  # Returns self so calls can be chained.
  def add(model, options = {})
    options = options.dup # never write defaults into the caller's hash
    options[:path] ||= Extlib::Inflection.tableize(model.to_s)
    @sources << [model, options]
    self
  end

  # Deletes previously generated sitemap_*.xml / sitemap_*.xml.gz files from
  # the sitemap directory. Returns self so calls can be chained.
  def clean
    Dir["#{@file_path}/sitemap_*.{xml,xml.gz}"].each { |file| FileUtils.rm(file) }
    self
  end

  # Writes the sitemap file(s) for every registered source, then the sitemap
  # index. Returns self so calls can be chained.
  #
  # Raises ArgumentError when a model offers no usable count or find method.
  def generate
    # Forget files from an earlier run so the index does not list them twice
    # (or list the previous index as if it were a sitemap).
    @sitemap_files = []

    @sources.each do |model, options|
      count_method = pick_method(model, COUNT_METHODS)
      find_method  = pick_method(model, FIND_METHODS)
      raise ArgumentError, "#{model} must provide a count_for_sitemap class method" if count_method.nil?
      raise ArgumentError, "#{model} must provide a find_for_sitemap class method" if find_method.nil?

      count        = model.send(count_method)
      num_sitemaps = 1
      num_batches  = 1

      if count > @options[:batch_size]
        num_batches  = (count.to_f / @options[:batch_size].to_f).ceil
        num_sitemaps = (count.to_f / @options[:max_per_sitemap].to_f).ceil
      end
      batches_per_sitemap = num_batches.to_f / num_sitemaps.to_f

      # Only genuine finder options reach the model; :path and friends are
      # sitemap metadata and would be rejected or misread by the finder.
      find_options = options.reject { |key, _value| SITEMAP_OPTIONS.include?(key) }

      (1..num_sitemaps).each do |sitemap_num|
        # Work out the start and end batch numbers for this sitemap.
        # NOTE(review): when num_batches is not a multiple of num_sitemaps this
        # arithmetic looks like it can leave a batch unassigned -- confirm and
        # rework separately; it is kept as-is here.
        batch_num_start = sitemap_num == 1 ? 1 : ((sitemap_num * batches_per_sitemap).ceil - batches_per_sitemap + 1).to_i
        batch_num_end   = (batch_num_start + [batches_per_sitemap, num_batches].min).floor - 1

        filename = "sitemap_#{Extlib::Inflection.tableize(model.to_s)}"
        filename = "#{filename}_#{sitemap_num}" if num_sitemaps > 1

        # Stream XML output to a file; the block form closes it even on error.
        xml_open(filename) do |file|
          xml = Builder::XmlMarkup.new(:target => file)
          xml.instruct!
          xml.urlset(:xmlns => 'http://www.sitemaps.org/schemas/sitemap/0.9') do
            (batch_num_start..batch_num_end).each do |batch_num|
              offset = (batch_num - 1) * @options[:batch_size]
              # The final batch holds exactly (count - offset) records; the
              # former "- 1" silently dropped the last one.
              limit = [count - offset, @options[:batch_size]].min

              batch_options =
                if num_batches > 1
                  find_options.merge(:limit => limit, :offset => offset)
                else
                  find_options
                end

              model.send(find_method, batch_options).each do |record|
                write_url(xml, record, options)
              end
            end
          end
        end
      end
    end

    generate_sitemap_index

    self
  end

  # Notifies every search engine enabled through the :ping_* options about the
  # most recently written file (the sitemap index once #generate has run).
  def ping_search_engines
    # Loaded lazily: only needed when pinging.
    require 'net/http'
    require 'cgi'

    sitemap_uri = CGI.escape(url_for_sitemap(@sitemap_files.last))

    if @options[:ping_google]
      Net::HTTP.get('www.google.com', "/webmasters/tools/ping?sitemap=#{sitemap_uri}")
    end

    if @options[:ping_yahoo]
      if @options[:yahoo_app_id]
        Net::HTTP.get(
          'search.yahooapis.com',
          '/SiteExplorerService/V1/updateNotification?' \
          "appid=#{@options[:yahoo_app_id]}&url=#{sitemap_uri}"
        )
      else
        $stderr.puts 'unable to ping Yahoo: no ":yahoo_app_id" provided'
      end
    end

    if @options[:ping_msn]
      Net::HTTP.get('webmaster.live.com', "/ping.aspx?siteMap=#{sitemap_uri}")
    end

    # Was spelled :pink_ask, which no caller could ever enable.
    if @options[:ping_ask]
      Net::HTTP.get('submissions.ask.com', "/ping?sitemap=#{sitemap_uri}")
    end
  end

  # Returns the site root ("protocol://host[:port]") built from the URL
  # options, memoized. The port is left out when it is the scheme's default.
  def root_url
    @root_url ||= begin
      protocol = (@default_url_options[:protocol] || 'http').to_s
      protocol = "#{protocol}://" unless protocol.include?('://')

      # Read the port before testing it: assigning it inside a trailing
      # modifier-if leaves the name undefined in the statement body.
      port         = @default_url_options[:port]
      default_port = protocol =~ /\Ahttps/ ? 443 : 80

      url = "#{protocol}#{@default_url_options[:host]}"
      url = "#{url}:#{port}" if port && port.to_i != default_port
      url # explicit value: a trailing modifier-if would yield nil when skipped
    end
  end

  private

  # Removes a single leading "/" from str.
  def strip_leading_slash(str)
    str.sub(%r{\A/}, '')
  end

  # Returns the first of candidates that model responds to, or nil.
  def pick_method(model, candidates)
    candidates.find { |candidate| model.respond_to?(candidate) }
  end

  # Emits one <url> element for record using the per-source options.
  def write_url(xml, record, options)
    last_mod_method = pick_method(record, TIMESTAMP_METHODS)
    last_mod        = last_mod_method.nil? ? Time.now : record.send(last_mod_method)

    xml.url do
      xml.loc(location_for(record, options))
      xml.lastmod(last_mod.strftime('%Y-%m-%d')) unless last_mod.nil?

      change_frequency = options[:change_frequency] || 'weekly'
      xml.changefreq(change_frequency.is_a?(Proc) ? change_frequency.call(record) : change_frequency)

      priority = options[:priority]
      unless priority.nil?
        xml.priority(priority.is_a?(Proc) ? priority.call(record) : priority)
      end
    end
  end

  # Returns the absolute URL for record: Rails' polymorphic_url when Rails is
  # loaded, otherwise root_url + the source's :path + the record's param.
  def location_for(record, options)
    return polymorphic_url(record) if defined?(Rails)

    param_method = pick_method(record, PARAM_METHODS)
    raise ArgumentError, "#{record.class} must provide a to_param instance method" if param_method.nil?

    "#{root_url}/#{strip_leading_slash(options[:path])}/#{record.send(param_method)}"
  end

  # Opens "<filename>.xml" (plus ".gz" when :gzip is set) inside the sitemap
  # directory for writing and records its path in @sitemap_files.
  #
  # With a block, yields the writer and closes it afterwards (also when the
  # block raises). Always returns the writer, which the caller must close
  # itself when no block was given.
  def xml_open(filename)
    # Build a new name instead of appending to the caller's string.
    filename = "#{filename}.xml"
    filename = "#{filename}.gz" if @options[:gzip]

    file = File.open("#{@file_path}/#{filename}", 'w+')
    @sitemap_files << file.path

    writer = @options[:gzip] ? Zlib::GzipWriter.new(file) : file

    if block_given?
      begin
        yield writer
      ensure
        writer.close
      end
    end

    writer
  end

  # Returns the public URL of a generated file. Files are written beneath
  # <document_root>/<:path>, so the URL carries the same :path segment.
  def url_for_sitemap(path)
    segments = [root_url, strip_leading_slash(@options[:path]), File.basename(path)]
    segments.reject { |segment| segment.empty? }.join('/')
  end

  # Create a sitemap index document
  def generate_sitemap_index
    # Snapshot first: xml_open registers the index file itself as well.
    sitemap_paths = @sitemap_files.dup
    today         = Time.now.strftime('%Y-%m-%d')

    xml_open('sitemap_index') do |file|
      xml = Builder::XmlMarkup.new(:target => file)
      xml.instruct!
      xml.sitemapindex(:xmlns => 'http://www.sitemaps.org/schemas/sitemap/0.9') do
        sitemap_paths.each do |path|
          xml.sitemap do
            xml.loc(url_for_sitemap(path))
            xml.lastmod(today)
          end
        end
      end
    end
  end
end