require 'builder'
# A LinkSet provisions a bunch of links to sitemap files. It also writes the index file
# which lists all the sitemap files written.
module SitemapGenerator
class LinkSet
# Options which, when passed to +group+, mean the group cannot simply reuse the
# current sitemap file (see the first branch in +group+).
@@requires_finalization_opts = [:filename, :sitemaps_path, :sitemaps_host, :namer]
# Options which give a group its own file location. +group+ consults these to detect
# a group that only changes :sitemaps_host; in that case the current sitemap is
# finalized and the group inherits the current namer.
@@new_location_opts = [:filename, :sitemaps_path, :namer]
# Readers only; the matching writers are defined in LocationHelpers.
attr_reader :default_host, :sitemaps_path, :filename, :create_index
attr_accessor :include_root, :include_index, :adapter, :yield_sitemap, :max_sitemap_links
# The +verbose+ reader is hand-written below because it falls back to
# SitemapGenerator.verbose when no value has been set on the instance.
attr_writer :verbose
# Create a new sitemap index and sitemap files. Pass a block with calls to the following
# methods:
# * +add+ - Add a link to the current sitemap
# * +group+ - Start a new group of sitemaps
#
# == Options
#
# Any option supported by +new+ can be passed. The options will be
# set on the instance using the accessor methods. This is provided mostly
# as a convenience.
#
# In addition to the options to +new+, the following options are supported:
# * :finalize - The sitemaps are written as they get full and at the end
# of the block. Pass +false+ as the value to prevent the sitemap or sitemap index
# from being finalized. Default is +true+.
#
# If you are calling +create+ more than once in your sitemap configuration file,
# make sure that you set a different +sitemaps_path+ or +filename+ for each call otherwise
# the sitemaps may be overwritten.
def create(opts={}, &block)
  reset!
  set_options(opts)
  # Always record the start time. Previously it was only recorded when +verbose+
  # was already true, so a block that switched verbose on part-way through made
  # the summary line compute `end_time - nil` and raise.
  start_time = Time.now
  puts "In '#{sitemap_index.location.public_path}':" if verbose
  interpreter.eval(:yield_sitemap => yield_sitemap?, &block)
  finalize!
  # +verbose+ is re-read here on purpose: the block may have changed it.
  output(sitemap_index.stats_summary(:time_taken => Time.now - start_time)) if verbose
  self
end
# Constructor
#
# == Options:
# * :adapter - instance of a class with a write method which takes a SitemapGenerator::Location
# and raw XML data and persists it. The default adapter is a SitemapGenerator::FileAdapter
# which simply writes files to the filesystem. You can use a SitemapGenerator::WaveAdapter
# for uploading sitemaps to remote servers - useful for read-only hosts such as Heroku. Or
# you can provide an instance of your own class to provide custom behavior.
#
# * :default_host - host including protocol to use in all sitemap links
# e.g. http://en.google.ca
#
# * :public_path - Full or relative path to the directory to write sitemaps into.
# Defaults to the public/ directory in your application root directory or
# the current working directory.
#
# * :sitemaps_host - String. Host including protocol to use when generating
# a link to a sitemap file i.e. the hostname of the server where the sitemaps are hosted.
# The value will differ from the hostname in your sitemap links.
# For example: `'http://amazon.aws.com/'`.
#
# Note that `include_index` is automatically turned off when the `sitemaps_host` does
# not match `default_host`. Because the link to the sitemap index file that would
# otherwise be added would point to a different host than the rest of the links in
# the sitemap. Something that the sitemap rules forbid.
#
# * :sitemaps_path - path fragment within public to write sitemaps
# to e.g. 'en/'. Sitemaps are written to public_path + sitemaps_path
#
# * :filename - symbol giving the base name for files (default :sitemap).
# The names are generated like "#{filename}.xml.gz", "#{filename}1.xml.gz", "#{filename}2.xml.gz"
# with the first file being the index if you have more than one sitemap file.
#
# * :include_index - Boolean. Whether to add a link pointing to the sitemap index
# to the current sitemap. This points search engines to your Sitemap Index to
# include it in the indexing of your site. Default is `false`. Turned off when
# `sitemaps_host` is set or within a `group()` block. Turned off because Google can complain
# about nested indexing and because if a robot is already reading your sitemap, they
# probably know about the index.
#
# * :include_root - Boolean. Whether to **add the root** url i.e. '/' to the
# current sitemap. Default is `true`. Turned off within a `group()` block.
#
# * :search_engines - Hash. A hash of search engine names mapped to
# ping URLs. See ping_search_engines.
#
# * :verbose - If +true+, output a summary line for each sitemap and sitemap
# index that is created. When not set, the value of `SitemapGenerator.verbose` is
# used, and if that is also unset the default is +true+.
#
# * :create_index - Supported values: `true`, `false`, `:auto`. Default: `:auto`.
# Whether to create a sitemap index file. If `true` an index file is always created,
# regardless of how many links are in your sitemap. If `false` an index file is never
# created. If `:auto` an index file is created only if your sitemap has more than
# one sitemap file.
#
# * :namer - A SitemapGenerator::SimpleNamer instance for generating the sitemap
# and index file names. See :filename if you don't need to do anything fancy, and can
# accept the default naming conventions.
#
# * :compress - Specifies which files to compress with gzip. Default is `true`. Accepted values:
# * `true` - Boolean; compress all files.
# * `false` - Boolean; write out only uncompressed files.
# * `:all_but_first` - Symbol; leave the first file uncompressed but compress any remaining files.
#
# The compression setting applies to groups too. So :all_but_first will have the same effect (the first
# file in the group will not be compressed, the rest will). So if you require different behaviour for your
# groups, pass in a `:compress` option e.g. group(:compress => false) { add('/link') }
#
# * :max_sitemap_links - The maximum number of links to put in each sitemap.
# Default is `SitemapGenerator::MAX_SITEMAP_LINKS`, or 50,000.
#
# Note: When adding a new option be sure to include it in `options_for_group()` if
# the option should be inherited by groups.
def initialize(options={})
  # Start all internal state at nil so the lazy initializers and nil checks
  # elsewhere in the class see a defined value.
  [
    :@default_host, :@sitemaps_host, :@yield_sitemap, :@sitemaps_path, :@adapter,
    :@verbose, :@protect_index, :@sitemap_index, :@added_default_links,
    :@created_group, :@sitemap
  ].each { |ivar| instance_variable_set(ivar, nil) }

  # Caller-supplied options win over these defaults.
  defaults = {
    :include_root => true,
    :include_index => false,
    :filename => :sitemap,
    :search_engines => {
      :google => "http://www.google.com/webmasters/tools/ping?sitemap=%s",
      :bing => "http://www.bing.com/webmaster/ping.aspx?sitemap=%s"
    },
    :create_index => :auto,
    :compress => true,
    :max_sitemap_links => SitemapGenerator::MAX_SITEMAP_LINKS
  }
  settings = SitemapGenerator::Utilities.reverse_merge(options, defaults)

  # Options are stored directly in instance variables; the setter methods are
  # deliberately not invoked here.
  settings.each { |name, value| instance_variable_set(:"@#{name}", value) }

  # An index handed in from outside is shared: sitemaps may be added to it,
  # but nothing else about it may be changed by this instance.
  @protect_index = true if settings[:sitemap_index]
end
# Add a link to the current sitemap, moving on to a new sitemap file when the
# current one cannot accept the link.
#
# link - string link e.g. '/merchant', '/article/1' or whatever.
# options - see README.
# host - host for the link, defaults to your default_host.
#
# The default links (see +add_default_links+) are added before the first link.
def add(link, options={})
  add_default_links unless @added_default_links
  link_options = SitemapGenerator::Utilities.reverse_merge(options, :host => @default_host)
  sitemap.add(link, link_options)
rescue SitemapGenerator::SitemapFullError
  # The current file is full: finalize it, then try the same link again.
  finalize_sitemap!
  retry
rescue SitemapGenerator::SitemapFinalizedError
  # The current file is already finalized: start the next one and try again.
  @sitemap = sitemap.new
  retry
end
# Add a link to the Sitemap Index.
# * link - A string link e.g. '/sitemaps/sitemap1.xml.gz' or a SitemapFile instance.
# * options - A hash of options including `:lastmod`, `:priority`, `:changefreq` and `:host`
#
# When no `:host` option is given, the value of `sitemaps_host` is used. That in turn
# falls back to `default_host` when no `sitemaps_host` has been set.
def add_to_index(link, options={})
  index_options = SitemapGenerator::Utilities.reverse_merge(options, :host => sitemaps_host)
  sitemap_index.add(link, index_options)
end
# Create a new group of sitemap files.
#
# Returns a new LinkSet instance with the options passed in set on it. All groups
# share the sitemap index, which is not affected by any of the options passed here.
#
# === Options
# Any of the options to LinkSet.new. Except for :public_path which is shared
# by all groups.
#
# The current options are inherited by the new group of sitemaps. The only exceptions
# being :include_index and :include_root which default to +false+.
#
# Pass a block to add links to the new LinkSet. If you pass a block the sitemaps will
# be finalized when the block returns.
#
# If you are not changing any of the location settings like filename,
# sitemaps_path, sitemaps_host or namer,
# links you add within the group will be added to the current sitemap.
# Otherwise the current sitemap file is finalized and a new sitemap file started,
# using the options you specified.
#
# Most commonly, you'll want to give the group's files a distinct name using
# the filename option.
#
# Options like :default_host can be used and it will only affect the links
# within the group. Links added outside of the group will revert to the previous
# +default_host+.
def group(opts={}, &block)
  @created_group = true
  original_opts = opts.dup
  if (@@requires_finalization_opts & original_opts.keys).empty?
    # If no new filename or path is specified reuse the default sitemap file.
    # A new location object will be set on it for the duration of the group.
    original_opts[:sitemap] = sitemap
  elsif original_opts.key?(:sitemaps_host) && (@@new_location_opts & original_opts.keys).empty?
    # If no location options are provided we are creating the next sitemap in the
    # current series, so finalize and inherit the namer.
    finalize_sitemap!
    original_opts[:namer] = namer
  end
  opts = options_for_group(original_opts)
  @group = SitemapGenerator::LinkSet.new(opts)
  if opts.key?(:sitemap)
    # If the group is sharing the current sitemap, set the
    # new location options on the location object.
    @original_location = @sitemap.location.dup
    @sitemap.location.merge!(@group.sitemap_location)
    if block_given?
      @group.interpreter.eval(:yield_sitemap => @yield_sitemap || SitemapGenerator.yield_sitemap?, &block)
      # The shared sitemap is intentionally NOT finalized here. It still belongs to
      # this LinkSet, and links added after the group must keep going into the same
      # file. Finalizing at this point would also happen while the group's location
      # settings are still merged in. Just restore the original location.
      @sitemap.location.merge!(@original_location)
    end
  else
    # Handle the case where a user only has one group, and it's being written
    # to a new sitemap file. They would expect there to be an index. So force
    # index creation. If there is more than one group, we would have an index anyways,
    # so it's safe to force index creation in these other cases. In the case that
    # the groups reuse the current sitemap, don't force index creation because
    # we want the default behaviour i.e. only an index if more than one sitemap file.
    # Don't force index creation if the user specifically requested no index. This
    # unfortunately means that if they set it to :auto they may be getting an index
    # when they didn't expect one, but you shouldn't be using groups if you only have
    # one sitemap and don't want an index. Rather, just add the links directly in the create()
    # block.
    @group.send(:create_index=, true, true) if @group.create_index != false
    if block_given?
      @group.interpreter.eval(:yield_sitemap => @yield_sitemap || SitemapGenerator.yield_sitemap?, &block)
      # The group owns its own sitemap file(s) in this branch, so finish them now.
      @group.finalize_sitemap!
    end
  end
  @group
end
# Ping search engines to notify them of updated sitemaps.
#
# Search engines are already notified for you if you run `rake sitemap:refresh`.
# If you want to ping search engines separately to your sitemap generation, run
# `rake sitemap:refresh:no_ping` and then run a rake task or script
# which calls this method as in the example below.
#
# == Arguments
# * sitemap_index_url - The full URL to your sitemap index file.
# If not provided the location is based on the `host` you have
# set and any other options like your `sitemaps_path`. The URL
# will be CGI escaped for you when included as part of the
# search engine ping URL.
#
# == Options
# A hash of one or more search engines to ping in addition to the
# default search engines. The key is the name of the search engine
# as a string or symbol and the value is the full URL to ping with
# a string interpolation that will be replaced by the CGI escaped sitemap
# index URL. If you have any literal percent characters in your URL you
# need to escape them with `%%`. For example if your sitemap index URL
# is `http://example.com/sitemap.xml.gz` and your
# ping url is `http://example.com/100%%/ping?url=%s`
# then the final URL that is pinged will be `http://example.com/100%/ping?url=http%3A%2F%2Fexample.com%2Fsitemap.xml.gz`
#
# == Examples
#
# Both of these examples will ping the default search engines in addition to `http://superengine.com/ping?url=http%3A%2F%2Fexample.com%2Fsitemap.xml.gz`
#
# SitemapGenerator::Sitemap.host('http://example.com/')
# SitemapGenerator::Sitemap.ping_search_engines(:super_engine => 'http://superengine.com/ping?url=%s')
#
# Is equivalent to:
#
# SitemapGenerator::Sitemap.ping_search_engines('http://example.com/sitemap.xml.gz', :super_engine => 'http://superengine.com/ping?url=%s')
def ping_search_engines(*args)
  require 'cgi/session'
  require 'open-uri'
  require 'timeout'

  # A trailing hash holds extra engines; an optional leading string is the index URL.
  extra_engines = args.last.is_a?(Hash) ? args.pop : {}
  raw_url = args.shift || sitemap_index_url
  escaped_url = CGI.escape(raw_url)

  output("\n")
  output("Pinging with URL '#{raw_url}':")
  search_engines.merge(extra_engines).each do |engine, template|
    # The template contains a %s placeholder for the escaped index URL.
    ping_url = template % escaped_url
    display_name = Utilities.titleize(engine.to_s)
    begin
      Timeout::timeout(10) do
        if URI.respond_to?(:open) # Available since Ruby 2.5
          URI.open(ping_url)
        else
          open(ping_url) # using Kernel#open became deprecated since Ruby 2.7. See https://bugs.ruby-lang.org/issues/15893
        end
      end
      output(" Successful ping of #{display_name}")
    rescue Timeout::Error, StandardError => error
      # A failed ping is reported but does not stop the remaining engines.
      output("Ping failed for #{display_name}: #{error.inspect} (URL #{ping_url})")
    end
  end
end
# Total number of links across all sitemaps, as reported by the sitemap index.
def link_count
  index = sitemap_index
  index.total_link_count
end
# Host used in links to the sitemap files themselves. Falls back to
# +default_host+ when no dedicated sitemaps host has been set.
def sitemaps_host
  return @sitemaps_host if @sitemaps_host
  @default_host
end
# Return the current sitemap file, building it from +sitemap_location+ the
# first time it is needed.
def sitemap
  return @sitemap if @sitemap
  @sitemap = SitemapGenerator::Builder::SitemapFile.new(sitemap_location)
end
# Return the sitemap index file, building it from +sitemap_index_location+ the
# first time it is needed.
def sitemap_index
  return @sitemap_index if @sitemap_index
  @sitemap_index = SitemapGenerator::Builder::SitemapIndexFile.new(sitemap_index_location)
end
# Return the full url to the sitemap index file, as reported by the index's
# +index_url+.
#
# Use this rather than `sitemap_index.location.url`: when `create_index` is `false`
# the first sitemap is technically the index, and the location url will not be
# correct in that situation.
#
# KJV: This is somewhat confusing.
def sitemap_index_url
  index = sitemap_index
  index.index_url
end
# All done. Write out remaining files.
#
# The current sitemap is finalized first and the sitemap index second. The order
# matters: finalizing the sitemap is what adds it to the index (see
# +finalize_sitemap!+), so the index has to be finalized afterwards.
def finalize!
finalize_sitemap!
finalize_sitemap_index!
end
# Return whether to add a link to the sitemap index file to the current sitemap.
# This points search engines to your Sitemap Index so they include it in the
# indexing of your site, but is not strictly necessary.
#
# Always +false+ when `sitemaps_host` differs from `default_host`. Otherwise the
# stored `include_index` setting is returned as-is; the constructor defaults it
# to `false`, and groups default it to `false` as well.
def include_index?
  hosts_differ = default_host && sitemaps_host && sitemaps_host != default_host
  hosts_differ ? false : @include_index
end
# Return +true+ or +false+: whether the root url i.e. '/' is added automatically to
# the current sitemap. The constructor defaults this to `true`; groups default it
# to `false`.
def include_root?
  @include_root ? true : false
end
# Return the verbose setting for this instance.
#
# A value set on the instance wins. Otherwise `SitemapGenerator.verbose` is used,
# and if that is also nil verbose defaults to true. The resolved value is stored
# on the instance, so later calls return it directly.
def verbose
  return @verbose unless @verbose.nil?
  global_setting = SitemapGenerator.verbose
  @verbose = global_setting.nil? ? true : global_setting
end
# Return whether the sitemap should be yielded to the block. Uses the instance
# setting when one has been given, otherwise defers to
# `SitemapGenerator.yield_sitemap?`.
def yield_sitemap?
  return SitemapGenerator.yield_sitemap? if @yield_sitemap.nil?
  @yield_sitemap ? true : false
end
protected
# Apply each option to this instance through its setter method. This will affect
# both the sitemap and the sitemap index.
#
# `filename` is applied first and `namer` second, before any other option, so that
# when both are passed the filename (whose setter installs a namer) does not
# override the explicit namer. A nil or false value for either is dropped.
# The caller's hash is not modified.
def set_options(opts={})
  remaining = opts.dup
  [:filename, :namer].each do |key|
    value = remaining.delete(key)
    send("#{key}=", value) if value
  end
  remaining.each { |key, value| send("#{key}=", value) }
end
# Build the options hash used to create a new group from this LinkSet.
#
# * :include_index and :include_root default to false, and the group shares this
#   LinkSet's sitemap index, unless +opts+ says otherwise.
# * Any :public_path in +opts+ is discarded because groups cannot change the
#   public path.
# * Every current setting listed below that is non-nil is inherited, unless
#   +opts+ already has a value for it.
def options_for_group(opts)
  group_defaults = {
    :include_index => false,
    :include_root => false,
    :sitemap_index => sitemap_index
  }
  opts = SitemapGenerator::Utilities.reverse_merge(opts, group_defaults)
  opts.delete(:public_path)

  # This list could be a problem because it needs to be maintained when new
  # options are added, but can easily be missed. We really could do with a
  # separate SitemapOptions class.
  inheritable = [
    :include_root, :include_index, :sitemaps_path, :public_path, :sitemaps_host,
    :verbose, :default_host, :adapter, :create_index, :compress, :max_sitemap_links
  ]
  inherited = inheritable.each_with_object({}) do |key, settings|
    value = instance_variable_get(:"@#{key}")
    settings[key] = value unless value.nil?
  end

  SitemapGenerator::Utilities.reverse_merge!(opts, inherited)
  opts
end
# Add the root url and/or the sitemap index link when those options are enabled.
# The flag is set before adding anything, so the nested calls to +add+ do not
# trigger this method again.
def add_default_links
  @added_default_links = true
  defaults = { :lastmod => Time.now, :priority => 1.0 }
  add('/', defaults) if include_root?
  add(sitemap_index, defaults) if include_index?
end
# Finalize the current sitemap by adding it to the index.
#
# Does nothing when the sitemap is already finalized, or when it is empty and a
# group has been created.
#
# If the default links have not been added yet and no group has been created, they
# are added first. In that case the sitemap is known to be empty, because default
# links are added on the first call to add(). This ensures that an empty block
# passed to create() still produces a sitemap containing the default links.
def finalize_sitemap!
  return if sitemap.finalized?
  return if sitemap.empty? && @created_group
  add_default_links unless @added_default_links || @created_group
  # Adding to the index is what finalizes the sitemap. We add to the index even if
  # not creating an index because the index keeps track of how many links are in our
  # sitemaps and we need this info for the summary line. Also the index determines
  # which file gets the first name so everything has to go via the index.
  add_to_index(sitemap) unless sitemap.empty?
end
# Finalize the sitemap index and write it out. Does nothing when the index is
# protected (it was passed in from outside) or has already been finalized.
def finalize_sitemap_index!
  unless @protect_index || sitemap_index.finalized?
    sitemap_index.finalize!
    sitemap_index.write
  end
end
# Return the interpreter linked to this instance, creating it on first use.
# The interpreter file is required here, on each call, rather than at load time.
def interpreter
  require 'sitemap_generator/interpreter'
  return @interpreter if @interpreter
  @interpreter = SitemapGenerator::Interpreter.new(:link_set => self)
end
# Reset this instance. Keep the same options, but return to the same state
# as before any sitemaps were created.
#
# A finalized sitemap index is dropped unless it is protected; a finalized sitemap
# is always dropped. Unfinalized files are kept. The namer is reset and the default
# links are marked as not yet added.
def reset!
  index_finished = @sitemap_index && @sitemap_index.finalized? && !@protect_index
  @sitemap_index = nil if index_finished
  sitemap_finished = @sitemap && @sitemap.finalized?
  @sitemap = nil if sitemap_finished
  namer.reset
  @added_default_links = false
end
# Write the given string to STDOUT, but only when verbose is on. Used so that the
# sitemap config can be evaluated and some info output to STDOUT in a lazy fashion.
def output(string)
  puts string if verbose
end
module LocationHelpers
public
# Set the host name, including protocol, that will be used by default on each
# of your sitemap links. You can pass a different host in your options to `add`
# if you need to change it on a per-link basis.
#
# The new value is also written to the :host of any existing, unfinalized sitemap
# and sitemap index location (see +update_location_info+).
# NOTE(review): this overwrites the location host even when a separate
# +sitemaps_host+ has already been set. Confirm that is intended.
def default_host=(value)
@default_host = value
update_location_info(:host, value)
end
# Set the public_path. This path gives the location of your public directory.
# The default is the public/ directory in your Rails root. Or if Rails is not
# found, it defaults to public/ in the current directory (of the process).
#
# Example: 'tmp/' if you don't want to generate in public for some reason.
#
# Set to nil to use the current directory.
#
# The value is given a trailing slash and wrapped in a Pathname. Relative paths
# are resolved against `SitemapGenerator.app.root`. Returns the resulting Pathname.
def public_path=(value)
  @public_path = Pathname.new(SitemapGenerator::Utilities.append_slash(value))
  @public_path = SitemapGenerator.app.root + @public_path if @public_path.relative?
  update_location_info(:public_path, @public_path)
  @public_path
end
# Return the public directory path. When none has been set, 'public/' is applied
# through the setter and its result is stored and returned.
def public_path
  return @public_path if @public_path
  @public_path = send(:public_path=, 'public/')
end
# Set the sitemaps_path. This path gives the location to write sitemaps to
# relative to your public_path.
# Example: 'sitemaps/' to generate your sitemaps in 'public/sitemaps/'.
#
# The new value is also written to the :sitemaps_path of any existing, unfinalized
# sitemap and sitemap index location (see +update_location_info+).
def sitemaps_path=(value)
@sitemaps_path = value
update_location_info(:sitemaps_path, value)
end
# Set the host name, including protocol, that will be used on all links to your sitemap
# files. Useful when the server that hosts the sitemaps is not on the same host as
# the links in the sitemap.
#
# Note that `include_index?` returns false once this differs from `default_host`, to
# avoid adding a link to a sitemap with a different host than the other links.
#
# The new value is also written to the :host of any existing, unfinalized sitemap
# and sitemap index location (see +update_location_info+).
def sitemaps_host=(value)
@sitemaps_host = value
update_location_info(:host, value)
end
# Set the filename base to use when generating sitemaps (and the sitemap index).
# A new SimpleNamer built from the filename replaces the current namer.
#
# === Example
# filename = :sitemap
#
# === Generates
# sitemap.xml.gz, sitemap1.xml.gz, sitemap2.xml.gz, ...
def filename=(value)
  @filename = value
  replacement_namer = SitemapGenerator::SimpleNamer.new(@filename)
  self.namer = replacement_namer
end
# Set the search engines hash to a new hash of search engine names mapped to
# ping URLs (see ping_search_engines). A nil (or false) value is stored as an
# empty hash.
# === Example
# search_engines = { :google => "http://www.google.com/webmasters/sitemaps/ping?sitemap=%s" }
def search_engines=(value)
  @search_engines = value ? value : {}
end
# Return the hash of search engines, or an empty hash when none is set.
def search_engines
  return @search_engines if @search_engines
  {}
end
# Build a fresh +SitemapLocation+ from the current settings. A new object is
# returned on every call.
def sitemap_location
  location_options = {
    :host => sitemaps_host,
    :namer => namer,
    :public_path => public_path,
    :sitemaps_path => @sitemaps_path,
    :adapter => @adapter,
    :verbose => verbose,
    :compress => @compress,
    :max_sitemap_links => max_sitemap_links
  }
  SitemapGenerator::SitemapLocation.new(location_options)
end
# Return a new +SitemapIndexLocation+ instance with the current options included.
#
# This must be a SitemapIndexLocation rather than a plain SitemapLocation: the
# :create_index setting belongs to the index location (see +create_index=+,
# which updates it on the index file's location).
def sitemap_index_location
  SitemapGenerator::SitemapIndexLocation.new(
    :host => sitemaps_host,
    :namer => namer,
    :public_path => public_path,
    :sitemaps_path => @sitemaps_path,
    :adapter => @adapter,
    :verbose => verbose,
    :create_index => @create_index,
    :compress => @compress
  )
end
# Set the value of +create_index+ on this instance and, when permitted, on the
# SitemapIndexLocation object of the SitemapIndexFile.
#
# Whether to create a sitemap index file. Supported values: `true`, `false`, `:auto`.
# If `true` an index file is always created, regardless of how many links
# are in your sitemap. If `false` an index file is never created.
# If `:auto` an index file is created only if your sitemap has more than
# one sitemap file.
#
# The index location is only updated when an index exists and is either
# unfinalized and unprotected, or +force+ is true. +force+ is used when creating a
# group, because sometimes an index has to be forced in that case. Generally
# we don't want people to change this value if the index is protected.
def create_index=(value, force=false)
  @create_index = value
  return unless @sitemap_index
  index_is_mutable = !@sitemap_index.finalized? && !@protect_index
  @sitemap_index.location[:create_index] = value if index_is_mutable || force
end
# Set the namer to use to generate the sitemap (and index) file names.
# This should be an instance of SitemapGenerator::SimpleNamer
#
# The namer is also set on the location of the current sitemap when it is not
# finalized, and on the location of the sitemap index when it is neither finalized
# nor protected.
def namer=(value)
  @namer = value
  sitemap_open = @sitemap && !@sitemap.finalized?
  @sitemap.location[:namer] = value if sitemap_open
  index_open = @sitemap_index && !@sitemap_index.finalized? && !@protect_index
  @sitemap_index.location[:namer] = value if index_open
end
# Return the namer object. When none is set, use the namer from the current
# sitemap's location if there is one, otherwise create a SimpleNamer from the
# current filename. The result is stored for later calls.
def namer
  return @namer if @namer
  inherited = @sitemap && @sitemap.location.namer
  @namer = inherited || SitemapGenerator::SimpleNamer.new(@filename)
end
# Set the value of the compress setting. The value is also written to the
# location of the sitemap index and of the current sitemap, when they exist.
#
# Values:
# * `true` - Boolean; compress all files
# * `false` - Boolean; write out only uncompressed files
# * `:all_but_first` - Symbol; leave the first file uncompressed but compress any remaining files.
#
# The compression setting applies to groups too. So :all_but_first will have the same effect (the first
# file in the group will not be compressed, the rest will). So if you require different behaviour for your
# groups, pass in a `:compress` option e.g. group(:compress => false) { add('/link') }
def compress=(value)
  @compress = value
  index_file, sitemap_file = @sitemap_index, @sitemap
  index_file.location[:compress] = value if index_file
  sitemap_file.location[:compress] = value if sitemap_file
end
# Return the current compression setting (the raw stored value). Its value
# determines which files will be gzip'ed: `true`, `false` or `:all_but_first`.
# See the setter for documentation of these values.
def compress
@compress
end
protected
# Update the given attribute on the location objects of the current sitemap index
# and sitemap file. Files that have not been created yet are not created, and
# finalized files are left alone.
#
# opts - :include_index controls whether the index location is updated. It
#        defaults to true unless the index is protected.
def update_location_info(attribute, value, opts={})
  opts = SitemapGenerator::Utilities.reverse_merge(opts, :include_index => !@protect_index)
  index_writable = opts[:include_index] && @sitemap_index && !@sitemap_index.finalized?
  @sitemap_index.location[attribute] = value if index_writable
  sitemap_writable = @sitemap && !@sitemap.finalized?
  @sitemap.location[attribute] = value if sitemap_writable
end
end
include LocationHelpers
end
end