require 'builder'

# A LinkSet provisions a bunch of links to sitemap files.  It also writes the index file
# which lists all the sitemap files written.
module SitemapGenerator
  class LinkSet
    # Options which, when passed to +group+, mean the group cannot share the current sitemap file.
    @@requires_finalization_opts = [:filename, :sitemaps_path, :sitemaps_namer, :sitemaps_host]
    # Options which, when passed to +group+, start a new series of sitemap files.
    @@new_location_opts = [:filename, :sitemaps_path, :sitemaps_namer]

    attr_reader :default_host, :sitemaps_path, :filename
    # NOTE: +sitemaps_host+ is deliberately NOT declared here.  Its reader is defined
    # below and its writer lives in LocationHelpers; an attr_accessor on the class would
    # shadow the LocationHelpers writer so the sitemap locations would never be updated.
    attr_accessor :yahoo_app_id, :include_root, :include_index, :adapter, :yield_sitemap, :create_index
    # The +verbose+ reader is defined explicitly below (it applies a default).
    attr_writer :verbose

    # Create a new sitemap index and sitemap files.  Pass a block with calls to the
    # following methods:
    # * +add+   - Add a link to the current sitemap
    # * +group+ - Start a new group of sitemaps
    #
    # == Options
    #
    # Any option supported by +new+ can be passed.  The options will be
    # set on the instance using the accessor methods.  This is provided mostly
    # as a convenience.
    #
    # In addition to the options to +new+, the following options are supported:
    # * :finalize - The sitemaps are written as they get full and at the end
    #   of the block.  Pass +false+ as the value to prevent the sitemap or sitemap index
    #   from being finalized.  Default is +true+.
    #   NOTE(review): every option is applied through a "#{key}=" writer and no
    #   +finalize=+ writer is visible in this file - confirm this option still works.
    #
    # If you are calling +create+ more than once in your sitemap configuration file,
    # make sure that you set a different +sitemaps_path+ or +filename+ for each call otherwise
    # the sitemaps may be overwritten.
    #
    # Returns +self+.
    def create(opts={}, &block)
      reset!
      set_options(opts)
      # Always record the start time: verbosity may be switched on inside the block,
      # in which case the summary below still needs a valid start time.
      start_time = Time.now
      puts "In #{sitemap_index.location.public_path}" if verbose
      interpreter.eval(:yield_sitemap => yield_sitemap?, &block)
      finalize!
      if verbose
        output(sitemap_index.stats_summary(:time_taken => Time.now - start_time))
      end
      self
    end

    # Deprecated.  Use create.
    #
    # Behaves like +create+ but always yields the sitemap to the block.  The previous
    # +yield_sitemap+ setting is restored afterwards, even if the block raises.
    # Returns the result of +create+ (+self+).
    def add_links(&block)
      original_value = @yield_sitemap
      @yield_sitemap = true
      create(&block)
    ensure
      @yield_sitemap = original_value
    end

    # Constructor
    #
    # == Options:
    # * :adapter - instance of a class with a write method which takes a SitemapGenerator::Location
    #   and raw XML data and persists it.  The default adapter is a SitemapGenerator::FileAdapter
    #   which simply writes files to the filesystem.  You can use a SitemapGenerator::WaveAdapter
    #   for uploading sitemaps to remote servers - useful for read-only hosts such as Heroku.  Or
    #   you can provide an instance of your own class to provide custom behavior.
    #
    # * :default_host - host including protocol to use in all sitemap links
    #   e.g. http://en.google.ca
    #
    # * :public_path - Full or relative path to the directory to write sitemaps into.
    #   Defaults to the public/ directory in your application root directory or
    #   the current working directory.
    #
    # * :sitemaps_host - String.  Host including protocol to use when generating
    #   a link to a sitemap file i.e. the hostname of the server where the sitemaps are hosted.
    #   The value will differ from the hostname in your sitemap links.
    #   For example: `'http://amazon.aws.com/'`.
    #
    #   Note that `include_index` is automatically turned off when the `sitemaps_host` does
    #   not match `default_host`.  Because the link to the sitemap index file that would
    #   otherwise be added would point to a different host than the rest of the links in
    #   the sitemap.  Something that the sitemap rules forbid.
    #
    # * :sitemaps_path - path fragment within public to write sitemaps
    #   to e.g. 'en/'.  Sitemaps are written to public_path + sitemaps_path
    #
    # * :filename - symbol giving the base name for files (default :sitemap).
    #   The sitemap names are generated like "#{filename}1.xml.gz", "#{filename}2.xml.gz"
    #   and the index name is like "#{filename}_index.xml.gz".
    #
    # * :sitemaps_namer - A +SitemapNamer+ instance for generating the sitemap names.
    #
    # * :include_index - Boolean.  Whether to add a link to the sitemap index
    #   to the current sitemap.  This points search engines to your Sitemap Index to
    #   include it in the indexing of your site.  Default is `false`.  Turned off when
    #   `sitemaps_host` is set or within a `group()` block.
    #
    # * :include_root - Boolean.  Whether to **add the root** url i.e. '/' to the
    #   current sitemap.  Default is `true`.  Turned off within a `group()` block.
    #
    # * :search_engines - Hash.  A hash of search engine names mapped to
    #   ping URLs.  See ping_search_engines.
    #
    # * :verbose - If +true+, output a summary line for each sitemap and sitemap
    #   index that is created.  When not set, the value of SitemapGenerator.verbose is
    #   used, and if that is not set either it defaults to +true+ (see +verbose+).
    #
    # * :create_index - Supported values: `true`, `false`, `:auto`.  Default: `true`.
    #   Whether to create a sitemap index file.  If `true` an index file is always created,
    #   regardless of how many links are in your sitemap.  If `false` an index file is never
    #   created.  If `:auto` an index file is created only if your sitemap has more than
    #   50,000 (or SitemapGenerator::MAX_SITEMAP_LINKS) links.
    #
    # KJV: When adding a new option be sure to include it in `options_for_group()` if
    # the option should be inherited by groups.
    def initialize(options={})
      options = SitemapGenerator::Utilities.reverse_merge(options,
        :include_root => true,
        :include_index => false,
        :filename => :sitemap,
        :search_engines => {
          :google         => "http://www.google.com/webmasters/sitemaps/ping?sitemap=%s",
          :bing           => "http://www.bing.com/webmaster/ping.aspx?siteMap=%s",
          :sitemap_writer => "http://www.sitemapwriter.com/notify.php?crawler=all&url=%s"
        },
        :create_index => true
      )
      # Options are stored directly as instance variables; the writer methods are
      # intentionally bypassed here.
      options.each_pair { |k, v| instance_variable_set("@#{k}".to_sym, v) }

      # If an index is passed in, protect it from modification.
      # Sitemaps can be added to the index but nothing else can be changed.
      if options[:sitemap_index]
        @protect_index = true
      end
    end

    # Add a link to a Sitemap.  If a new Sitemap is required, one will be created for
    # you.
    #
    # link - string link e.g. '/merchant', '/article/1' or whatever.
    # options - see README.
    #   host - host for the link, defaults to your default_host.
    def add(link, options={})
      add_default_links if !@added_default_links
      sitemap.add(link, SitemapGenerator::Utilities.reverse_merge(options, :host => @default_host))
    rescue SitemapGenerator::SitemapFullError
      # The current sitemap is full: write it out and try again.
      finalize_sitemap!
      retry
    rescue SitemapGenerator::SitemapFinalizedError
      # The current sitemap has already been written: start the next one and try again.
      @sitemap = sitemap.new
      retry
    end

    # Add a link to the Sitemap Index.
    # * link - A string link e.g. '/sitemaps/sitemap1.xml.gz' or a SitemapFile instance.
    # * options - A hash of options including `:lastmod`, `:priority`, `:changefreq` and `:host`
    #
    # The `:host` option defaults to the value of `sitemaps_host` which is the host where your
    # sitemaps reside.  If no `sitemaps_host` is set, the `default_host` is used.
    def add_to_index(link, options={})
      sitemap_index.add(link, SitemapGenerator::Utilities.reverse_merge(options, :host => sitemaps_host))
    end

    # Create a new group of sitemap files.
    #
    # Returns a new LinkSet instance with the options passed in set on it.  All groups
    # share the sitemap index, which is not affected by any of the options passed here.
    #
    # === Options
    # Any of the options to LinkSet.new.  Except for :public_path which is shared
    # by all groups.
    #
    # The current options are inherited by the new group of sitemaps.  The only exceptions
    # being :include_index and :include_root which default to +false+.
    #
    # Pass a block to add links to the new LinkSet.  If you pass a block the sitemaps will
    # be finalized when the block returns.
    #
    # If you are not changing any of the location settings like filename,
    # sitemaps_path, sitemaps_host or sitemaps_namer
    # links you add within the group will be added to the current sitemap file (e.g. sitemap1.xml).
    # If one of these options is specified, the current sitemap file is finalized
    # and a new sitemap file started.
    #
    # Options like :default_host can be used and it will only affect the links
    # within the group.  Links added outside of the group will revert to the previous
    # +default_host+.
    #
    # The +opts+ hash passed in by the caller is not modified.
    def group(opts={}, &block)
      @created_group = true
      original_opts = opts.dup
      # Work on a private copy so the caller's hash is never written to.
      opts = opts.dup

      if (@@requires_finalization_opts & original_opts.keys).empty?
        # If no new filename or path is specified reuse the default sitemap file.
        # A new location object will be set on it for the duration of the group.
        opts[:sitemap] = sitemap
      elsif original_opts.key?(:sitemaps_host) && (@@new_location_opts & original_opts.keys).empty?
        # If no location options are provided we are creating the next sitemap in the
        # current series, so finalize and inherit the namer.
        finalize_sitemap!
        opts[:sitemaps_namer] = sitemaps_namer
      end

      opts = options_for_group(opts)
      @group = SitemapGenerator::LinkSet.new(opts)
      if opts.key?(:sitemap)
        # If the group is sharing the current sitemap, set the
        # new location options on the location object.
        @original_location = @sitemap.location.dup
        @sitemap.location.merge!(@group.sitemap_location)
        if block_given?
          @group.interpreter.eval(:yield_sitemap => @yield_sitemap || SitemapGenerator.yield_sitemap?, &block)
          # Restore the location that was in effect before the group.
          @sitemap.location.merge!(@original_location)
        end
      elsif block_given?
        @group.interpreter.eval(:yield_sitemap => @yield_sitemap || SitemapGenerator.yield_sitemap?, &block)
        @group.finalize_sitemap!
      end
      @group
    end

    # Ping search engines to notify them of updated sitemaps.
    #
    # Search engines are already notified for you if you run `rake sitemap:refresh`.
    # If you want to ping search engines separately to your sitemap generation, run
    # `rake sitemap:refresh:no_ping` and then run a rake task or script
    # which calls this method as in the example below.
    #
    # == Arguments
    # * sitemap_index_url - The full URL to your sitemap index file.
    #   If not provided the location is based on the `host` you have
    #   set and any other options like your `sitemaps_path`.  The URL
    #   will be CGI escaped for you when included as part of the
    #   search engine ping URL.
    #
    # == Options
    # A hash of one or more search engines to ping in addition to the
    # default search engines.  The key is the name of the search engine
    # as a string or symbol and the value is the full URL to ping with
    # a string interpolation that will be replaced by the CGI escaped sitemap
    # index URL.  If you have any literal percent characters in your URL you
    # need to escape them with `%%`.  For example if your sitemap index URL
    # is `http://example.com/sitemap_index.xml.gz` and your
    # ping url is `http://example.com/100%%/ping?url=%s`
    # then the final URL that is pinged will be `http://example.com/100%/ping?url=http%3A%2F%2Fexample.com%2Fsitemap_index.xml.gz`
    #
    # == Examples
    #
    # Both of these examples will ping the default search engines in addition to `http://superengine.com/ping?url=http%3A%2F%2Fexample.com%2Fsitemap_index.xml.gz`
    #
    #   SitemapGenerator::Sitemap.host('http://example.com/')
    #   SitemapGenerator::Sitemap.ping_search_engines(:super_engine => 'http://superengine.com/ping?url=%s')
    #
    # Is equivalent to:
    #
    #   SitemapGenerator::Sitemap.ping_search_engines('http://example.com/sitemap_index.xml.gz', :super_engine => 'http://superengine.com/ping?url=%s')
    #
    # A failed or timed-out ping (10 second limit per engine) is reported through
    # +output+ and does not stop the remaining engines from being pinged.
    def ping_search_engines(*args)
      # Only CGI.escape is needed, so the session library is not loaded.
      require 'cgi'
      require 'open-uri'
      require 'timeout'

      engines = args.last.is_a?(Hash) ? args.pop : {}
      index_url = CGI.escape(args.shift || sitemap_index_url)

      output("\n")
      search_engines.merge(engines).each do |engine, link|
        ping_url = link % index_url
        name = Utilities.titleize(engine.to_s)
        begin
          Timeout::timeout(10) do
            # open-uri stopped patching Kernel#open in Ruby 3.0, so a bare open(url)
            # would look for a local file.  Use URI.open where it exists and only
            # fall back to Kernel#open on Rubies that predate it.
            URI.respond_to?(:open) ? URI.open(ping_url) : open(ping_url)
          end
          output("Successful ping of #{name}")
        rescue Timeout::Error, StandardError => e
          output("Ping failed for #{name}: #{e.inspect} (URL #{ping_url})")
        end
      end
    end

    # Return a count of the total number of links in all sitemaps
    def link_count
      sitemap_index.total_link_count
    end

    # Return the host to use in links to the sitemap files.  This defaults to your
    # +default_host+.
    def sitemaps_host
      @sitemaps_host || @default_host
    end

    # Lazy-initialize a sitemap instance and return it.
    def sitemap
      @sitemap ||= SitemapGenerator::Builder::SitemapFile.new(sitemap_location)
    end

    # Lazy-initialize a sitemap index instance and return it.
    def sitemap_index
      @sitemap_index ||= SitemapGenerator::Builder::SitemapIndexFile.new(sitemap_index_location)
    end

    # Return the full url to the sitemap index file.
    def sitemap_index_url
      sitemap_index.location.url
    end

    # Finalize the current sitemap and then the sitemap index.
    def finalize!
      finalize_sitemap!
      finalize_sitemap_index!
    end

    # Return a boolean indicating whether to add a link to the sitemap index file
    # to the current sitemap.  This points search engines to your Sitemap Index so
    # they include it in the indexing of your site, but is not strictly necessary.
    # Default is `false` (see +initialize+).  Always `false` when `sitemaps_host`
    # differs from `default_host`; groups default it to `false` as well.
    def include_index?
      if default_host && sitemaps_host && sitemaps_host != default_host
        false
      else
        @include_index
      end
    end

    # Return a boolean indicating whether to automatically add the root url i.e. '/' to the
    # current sitemap.  Default is `true`.  Turned off within a `group()` block.
    def include_root?
      !!@include_root
    end

    # Return the verbose setting.  If it has not been set on the instance, the value of
    # SitemapGenerator.verbose is used, and if that is +nil+ too, verbose is +true+.
    # The resolved value is remembered on the instance.
    # (SitemapGenerator.verbose is defined outside this file - presumably it reflects
    # ENV['VERBOSE']; when running rake tasks, pass the -s option to rake to turn
    # verbose off.)
    def verbose
      if @verbose.nil?
        @verbose = SitemapGenerator.verbose.nil? ? true : SitemapGenerator.verbose
      end
      @verbose
    end

    # Return a boolean indicating whether or not to yield the sitemap.
    # Falls back to SitemapGenerator.yield_sitemap? when not set on the instance.
    def yield_sitemap?
      @yield_sitemap.nil? ? SitemapGenerator.yield_sitemap? : !!@yield_sitemap
    end

    protected

    # Set each option on this instance using accessor methods.  This will affect
    # both the sitemap and the sitemap index.
    #
    # If both `filename` and `sitemaps_namer` are passed, set filename first so it
    # doesn't override the latter.
    def set_options(opts={})
      opts = opts.dup
      %w(filename sitemaps_namer).each do |key|
        if value = opts.delete(key.to_sym)
          send("#{key}=", value)
        end
      end
      opts.each_pair do |key, value|
        send("#{key}=", value)
      end
    end

    # Given +opts+, return a hash of options prepped for creating a new group from this LinkSet.
    # If :public_path is present in +opts+ it is removed because groups cannot
    # change the public path.
    def options_for_group(opts)
      opts = SitemapGenerator::Utilities.reverse_merge(opts,
        :include_index => false,
        :include_root => false,
        :sitemap_index => sitemap_index
      )
      opts.delete(:public_path)

      # Reverse merge the current settings
      # KJV: This hash could be a problem because it needs to be maintained
      # when new options are added, but can easily be missed.  We really could
      # do with a separate SitemapOptions class.
      current_settings = [
        :include_root,
        :include_index,
        :sitemaps_path,
        :public_path,
        :sitemaps_host,
        :verbose,
        :default_host,
        :adapter,
        :create_index
      ].inject({}) do |hash, key|
        value = instance_variable_get(:"@#{key}")
        # Only skip settings that were never set.  An explicit +false+ (for example
        # :verbose => false) must be inherited by the group too.
        hash[key] = value unless value.nil?
        hash
      end
      SitemapGenerator::Utilities.reverse_merge!(opts, current_settings)
      opts
    end

    # Add default links if those options are turned on.  Record the fact that we have done so
    # in an instance variable.
    def add_default_links
      if include_root?
        sitemap.add('/', :lastmod => Time.now, :changefreq => 'always', :priority => 1.0, :host => @default_host)
      end
      if include_index?
        sitemap.add(sitemap_index, :lastmod => Time.now, :changefreq => 'always', :priority => 1.0)
      end
      @added_default_links = true
    end

    # Finalize a sitemap by including it in the index and outputting a summary line.
    # Do nothing if it has already been finalized.
    #
    # Don't finalize if the sitemap is empty and a group has been created.  The reason
    # being that the group will have written out its sitemap.
    #
    # Add the default links if they have not been added yet and no groups have been created.
    # If the default links haven't been added we know that the sitemap is empty,
    # because they are added on the first call to add().  This ensures that if the
    # block passed to create() is empty the default links are still included in the
    # sitemap.
    def finalize_sitemap!
      return if sitemap.finalized? || (sitemap.empty? && @created_group)
      add_default_links if !@added_default_links && !@created_group
      # This will finalize it.  We add to the index even if not creating an index because
      # the index keeps track of how many links are in our sitemaps and we need this info
      # for the summary line.  If not for that problem, I would add the sitemap to
      # the index only if create_index is truthy.
      add_to_index(sitemap)
      output(sitemap.summary)
    end

    # Finalize a sitemap index and output a summary line.  Do nothing if it has already
    # been finalized, if the index is protected (it was passed in by a parent LinkSet),
    # or if index creation is turned off.  With `:create_index => :auto` the index is
    # only finalized when its link count is greater than one.
    def finalize_sitemap_index!
      return if @protect_index || !@create_index || sitemap_index.finalized?
      return if @create_index == :auto && sitemap_index.link_count <= 1
      sitemap_index.finalize!
      output(sitemap_index.summary)
    end

    # Return the interpreter linked to this instance.
    def interpreter
      # Loaded lazily, on first use.
      require 'sitemap_generator/interpreter'
      @interpreter ||= SitemapGenerator::Interpreter.new(:link_set => self)
    end

    # Reset this instance.  Keep the same options, but return to the same state
    # as before any sitemaps were created.
    def reset!
      @sitemap_index = nil if @sitemap_index && @sitemap_index.finalized? && !@protect_index
      @sitemap = nil if @sitemap && @sitemap.finalized?
      self.sitemaps_namer.reset # start from 1
      @added_default_links = false
    end

    # Write the given string to STDOUT.  Used so that the sitemap config can be
    # evaluated and some info output to STDOUT in a lazy fashion.
    # Nothing is written unless +verbose+ is truthy.
    def output(string)
      return unless verbose
      puts string
    end

    module LocationHelpers
      public

      # Set the host name, including protocol, that will be used by default on each
      # of your sitemap links.  You can pass a different host in your options to `add`
      # if you need to change it on a per-link basis.
      #
      # NOTE(review): this also writes the value as the :host of any existing sitemap
      # and index locations, even when a separate +sitemaps_host+ has been set -
      # confirm that is intended.
      def default_host=(value)
        @default_host = value
        update_location_info(:host, value)
      end

      # Set the public_path.  This path gives the location of your public directory.
      # The default is the public/ directory in your Rails root.  Or if Rails is not
      # found, it defaults to public/ in the current directory (of the process).
      #
      # Example: 'tmp/' if you don't want to generate in public for some reason.
      #
      # Set to nil to use the current directory.
      #
      # Relative paths are resolved against SitemapGenerator.app.root.
      def public_path=(value)
        @public_path = Pathname.new(value.to_s)
        @public_path = SitemapGenerator.app.root + @public_path if @public_path.relative?
        update_location_info(:public_path, @public_path)
        @public_path
      end

      # Return a Pathname with the full path to the public directory
      def public_path
        @public_path ||= self.send(:public_path=, 'public/')
      end

      # Set the sitemaps_path.  This path gives the location to write sitemaps to
      # relative to your public_path.
      # Example: 'sitemaps/' to generate your sitemaps in 'public/sitemaps/'.
      def sitemaps_path=(value)
        @sitemaps_path = value
        update_location_info(:sitemaps_path, value)
      end

      # Set the host name, including protocol, that will be used on all links to your sitemap
      # files.  Useful when the server that hosts the sitemaps is not on the same host as
      # the links in the sitemap.
      #
      # Note that `include_index` will be turned off to avoid adding a link to a sitemap with
      # a different host than the other links.
      def sitemaps_host=(value)
        @sitemaps_host = value
        update_location_info(:host, value)
      end

      # Set the filename base to use when generating sitemaps and sitemap indexes.
      # The index name will be +value+ with _index.xml.gz appended.
      # Replaces both the sitemaps namer and the sitemap index namer.
      # === Example
      #   filename = :sitemap
      def filename=(value)
        @filename = value
        self.sitemaps_namer = SitemapGenerator::SitemapNamer.new(@filename)
        self.sitemap_index_namer = SitemapGenerator::SitemapIndexNamer.new("#{@filename}_index")
      end

      # Set the search engines hash to a new hash of search engine names mapped to
      # ping URLs (see ping_search_engines).  If the value is nil it is converted
      # to an empty hash.
      # === Example
      #   search_engines = { :google => "http://www.google.com/webmasters/sitemaps/ping?sitemap=%s" }
      def search_engines=(value)
        @search_engines = value || {}
      end

      # Return the hash of search engines.
      def search_engines
        @search_engines || {}
      end

      # Set the namer to use when generating SitemapFiles (does not apply to the
      # SitemapIndexFile)
      def sitemaps_namer=(value)
        @sitemaps_namer = value
        @sitemap.location[:namer] = value if @sitemap && !@sitemap.finalized?
      end

      # Return the current sitemaps namer object.  If it is not set, looks for it on
      # the current sitemap and if there is no sitemap, creates a new one using
      # the current filename.
      def sitemaps_namer
        @sitemaps_namer ||= @sitemap && @sitemap.location.namer || SitemapGenerator::SitemapNamer.new(@filename)
      end

      # Set the namer to use when generating the SitemapIndexFile (does not apply to
      # the SitemapFiles).  An existing index location is only updated when the index
      # has not been finalized and is not protected.
      def sitemap_index_namer=(value)
        @sitemap_index_namer = value
        @sitemap_index.location[:namer] = value if @sitemap_index && !@sitemap_index.finalized? && !@protect_index
      end

      # Return the current sitemap index namer object.  If it is not set, looks for it
      # on the current sitemap index and if there is no index, creates a new one using
      # the current filename with "_index" appended.
      def sitemap_index_namer
        @sitemap_index_namer ||= @sitemap_index && @sitemap_index.location.namer || SitemapGenerator::SitemapIndexNamer.new("#{@filename}_index")
      end

      # Return a new +SitemapLocation+ instance with the current options included
      def sitemap_location
        SitemapGenerator::SitemapLocation.new(
          :host => sitemaps_host,
          :namer => sitemaps_namer,
          :public_path => public_path,
          :sitemaps_path => @sitemaps_path,
          :adapter => @adapter
        )
      end

      # Return a new +SitemapLocation+ instance for the sitemap index, with the current
      # options included.  It differs from +sitemap_location+ only in the namer used.
      def sitemap_index_location
        SitemapGenerator::SitemapLocation.new(
          :host => sitemaps_host,
          :namer => sitemap_index_namer,
          :public_path => public_path,
          :sitemaps_path => @sitemaps_path,
          :adapter => @adapter
        )
      end

      protected

      # Update the given attribute on the current sitemap index and sitemap file location objects.
      # But don't create the index or sitemap files yet if they are not already created.
      #
      # Pass `:include_index => false` in +opts+ to leave the index location alone; by
      # default the index is updated unless it is protected.  Finalized files are never updated.
      def update_location_info(attribute, value, opts={})
        opts = SitemapGenerator::Utilities.reverse_merge(opts, :include_index => !@protect_index)
        @sitemap_index.location[attribute] = value if opts[:include_index] && @sitemap_index && !@sitemap_index.finalized?
        @sitemap.location[attribute] = value if @sitemap && !@sitemap.finalized?
      end
    end
    include LocationHelpers
  end
end