require 'builder'
require 'pathname'

# A LinkSet provisions a bunch of links to sitemap files.  It also writes the index file
# which lists all the sitemap files written.
module SitemapGenerator
  class LinkSet
    # Group options which mean the group cannot share the current sitemap file.
    @@requires_finalization_opts = [:filename, :sitemaps_path, :sitemaps_namer, :sitemaps_host]
    # Group options which start a new series of sitemap files.
    @@new_location_opts = [:filename, :sitemaps_path, :sitemaps_namer]

    attr_reader :default_host, :sitemaps_path, :filename
    # NOTE: :sitemaps_host is deliberately not listed here.  Its reader is defined
    # below and its writer lives in LocationHelpers; an attr_accessor on the class
    # would shadow the module's writer and skip the location update it performs.
    attr_accessor :verbose, :yahoo_app_id, :include_root, :include_index, :adapter

    # Create a new sitemap index and sitemap files.  Pass a block with calls to the
    # following methods:
    # * +add+   - Add a link to the current sitemap
    # * +group+ - Start a new group of sitemaps
    #
    # == Options
    #
    # Any option supported by +new+ can be passed.  The options will be
    # set on the instance using the accessor methods.  This is provided mostly
    # as a convenience.
    #
    # In addition to the options to +new+, the following options are supported:
    # * :finalize - The sitemaps are written as they get full and at the end
    #   of the block.  Pass +false+ as the value to prevent the sitemap or sitemap index
    #   from being finalized (the verbose stats summary is skipped as well).
    #   Default is +true+.
    #
    # If you are calling +create+ more than once in your sitemap configuration file,
    # make sure that you set a different +sitemaps_path+ or +filename+ for each call otherwise
    # the sitemaps may be overwritten.
    #
    # The +opts+ hash passed in is not modified.  Returns +self+.
    def create(opts={}, &block)
      opts = opts.dup
      # :finalize is not an accessor, so pull it out before the rest are applied.
      should_finalize = opts.delete(:finalize) != false

      reset!
      set_options(opts)
      # Always record the start time: verbose may be switched on inside the block.
      start_time = Time.now
      interpreter.eval(:yield_sitemap => @yield_sitemap || SitemapGenerator.yield_sitemap?, &block)
      if should_finalize
        finalize!
        puts sitemap_index.stats_summary(:time_taken => Time.now - start_time) if @verbose
      end
      self
    end

    # Deprecated.  Use create.
    #
    # Behaves like +create+ without options, but always yields the sitemap to the block.
    def add_links(&block)
      @yield_sitemap = true
      create(&block)
    ensure
      # Reset the flag even if the block raised, so later calls are unaffected.
      @yield_sitemap = false
    end

    # Constructor
    #
    # == Options:
    # * :adapter - instance of a class with a write method which takes a SitemapGenerator::Location
    #   and raw XML data and persists it.  The default adapter is a SitemapGenerator::FileAdapter
    #   which simply writes files to the filesystem.  You can use a SitemapGenerator::WaveAdapter
    #   for uploading sitemaps to remote servers - useful for read-only hosts such as Heroku.  Or
    #   you can provide an instance of your own class to provide custom behavior.
    #
    # * :default_host - host including protocol to use in all sitemap links
    #   e.g. http://en.google.ca
    #
    # * :public_path - Full or relative path to the directory to write sitemaps into.
    #   Defaults to the public/ directory in your application root directory or
    #   the current working directory.
    #
    # * :sitemaps_host - String.  Host including protocol to use when generating
    #   a link to a sitemap file i.e. the hostname of the server where the sitemaps are hosted.
    #   The value will differ from the hostname in your sitemap links.
    #   For example: `'http://amazon.aws.com/'`.
    #
    #   Note that `include_index` is automatically turned off when the `sitemaps_host` does
    #   not match `default_host`, because the link to the sitemap index file that would
    #   otherwise be added would point to a different host than the rest of the links in
    #   the sitemap - something that the sitemap rules forbid.
    #
    # * :sitemaps_path - path fragment within public to write sitemaps
    #   to e.g. 'en/'.  Sitemaps are written to public_path + sitemaps_path
    #
    # * :filename - symbol giving the base name for files (default :sitemap).
    #   The sitemap names are generated like "#{filename}1.xml.gz", "#{filename}2.xml.gz"
    #   and the index name is like "#{filename}_index.xml.gz".
    #
    # * :sitemaps_namer - A +SitemapNamer+ instance for generating the sitemap names.
    #
    # * :include_index - Boolean.  Whether to add a link to the sitemap index
    #   to the current sitemap.  This points search engines to your Sitemap Index to
    #   include it in the indexing of your site.  Default is `true`.  Turned off when
    #   `sitemaps_host` differs from `default_host` or within a `group()` block.
    #
    # * :include_root - Boolean.  Whether to **add the root** url i.e. '/' to the
    #   current sitemap.  Default is `true`.  Turned off within a `group()` block.
    #
    # * :verbose - If +true+, output a summary line for each sitemap and sitemap
    #   index that is created.  Default is +false+.
    #
    # Every key in +options+ is stored in the instance variable of the same name.
    # The +options+ hash passed in is not modified.
    def initialize(options={})
      options = {
        :include_root => true,
        :include_index => true,
        :filename => :sitemap,
        :verbose => false
      }.merge(options)
      options.each_pair { |k, v| instance_variable_set("@#{k}".to_sym, v) }

      # If an index is passed in, protect it from modification.
      # Sitemaps can be added to the index but nothing else can be changed.
      @protect_index = true if options[:sitemap_index]
    end

    # Add a link to a Sitemap.  If a new Sitemap is required, one will be created for
    # you.
    #
    # link - string link e.g. '/merchant', '/article/1' or whatever.
    # options - see README.
    #   host - host for the link, defaults to your default_host.
    #
    # The +options+ hash passed in is not modified.
    def add(link, options={})
      add_default_links if !@added_default_links
      # Build a fresh hash on every attempt so retries never see a hash that a
      # previous attempt may have altered.
      sitemap.add(link, { :host => @default_host }.merge(options))
    rescue SitemapGenerator::SitemapFullError
      finalize_sitemap!
      retry
    rescue SitemapGenerator::SitemapFinalizedError
      @sitemap = sitemap.new
      retry
    end

    # Create a new group of sitemap files.
    #
    # Returns a new LinkSet instance with the options passed in set on it.  All groups
    # share the sitemap index, which is not affected by any of the options passed here.
    #
    # === Options
    # Any of the options to LinkSet.new.  Except for :public_path which is shared
    # by all groups.
    #
    # The current options are inherited by the new group of sitemaps.  The only exceptions
    # being :include_index and :include_root which default to +false+.
    #
    # Pass a block to add links to the new LinkSet.  If you pass a block the sitemaps will
    # be finalized when the block returns.
    #
    # If you are not changing any of the location settings like filename,
    # sitemaps_path, sitemaps_host or sitemaps_namer,
    # links you add within the group will be added to the current sitemap file (e.g. sitemap1.xml).
    # If only sitemaps_host is changed, the current sitemap file is finalized and the
    # group continues the current series of sitemap names.  If filename, sitemaps_path
    # or sitemaps_namer is specified, the group writes to its own sitemap files.
    #
    # Options like :default_host can be used and it will only affect the links
    # within the group.  Links added outside of the group will revert to the previous
    # +default_host+.
    #
    # The +opts+ hash passed in is not modified.
    def group(opts={}, &block)
      @created_group = true
      original_opts = opts
      opts = opts.dup
      if (@@requires_finalization_opts & original_opts.keys).empty?
        # If no new filename or path is specified reuse the default sitemap file.
        # A new location object will be set on it for the duration of the group.
        opts[:sitemap] = sitemap
      elsif original_opts.key?(:sitemaps_host) && (@@new_location_opts & original_opts.keys).empty?
        # If no location options are provided we are creating the next sitemap in the
        # current series, so finalize and inherit the namer.
        finalize_sitemap!
        opts[:sitemaps_namer] = sitemaps_namer
      end

      opts = options_for_group(opts)
      @group = SitemapGenerator::LinkSet.new(opts)
      if opts.key?(:sitemap)
        # If the group is sharing the current sitemap, set the
        # new location options on the location object.
        @original_location = @sitemap.location.dup
        @sitemap.location.merge!(@group.sitemap_location)
        if block_given?
          begin
            @group.interpreter.eval(:yield_sitemap => @yield_sitemap || SitemapGenerator.yield_sitemap?, &block)
          ensure
            # Restore the shared sitemap's location even if the block raised.
            @sitemap.location.merge!(@original_location)
          end
        end
      elsif block_given?
        @group.interpreter.eval(:yield_sitemap => @yield_sitemap || SitemapGenerator.yield_sitemap?, &block)
        @group.finalize_sitemap!
      end
      @group
    end

    # Ping search engines with the URL of the sitemap index.  Each ping is given
    # 10 seconds; failures are reported (when verbose) but never raised.
    #
    # @see http://en.wikipedia.org/wiki/Sitemap_index
    def ping_search_engines
      require 'cgi'
      require 'open-uri'
      require 'timeout'

      sitemap_index_url = CGI.escape(sitemap_index.location.url)
      search_engines = {
        :google         => "http://www.google.com/webmasters/sitemaps/ping?sitemap=#{sitemap_index_url}",
        :yahoo          => "http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=#{yahoo_app_id}&url=#{sitemap_index_url}",
        :ask            => "http://submissions.ask.com/ping?sitemap=#{sitemap_index_url}",
        :bing           => "http://www.bing.com/webmaster/ping.aspx?siteMap=#{sitemap_index_url}",
        :sitemap_writer => "http://www.sitemapwriter.com/notify.php?crawler=all&url=#{sitemap_index_url}"
      }
      puts "\n" if verbose

      search_engines.each do |engine, link|
        next if engine == :yahoo && !self.yahoo_app_id
        begin
          # Kernel#open no longer opens URLs on Ruby 3.0+, so go through the URI
          # object.  The (empty) block form closes the response when done.
          Timeout::timeout(10) { URI.parse(link).open { |_response| } }
          puts "Successful ping of #{engine.to_s.titleize}" if verbose
        rescue Timeout::Error, StandardError => e
          puts "Ping failed for #{engine.to_s.titleize}: #{e.inspect} (URL #{link})" if verbose
        end
      end

      if !self.yahoo_app_id && verbose
        puts "\n"
        puts <<-END.gsub(/^\s+/, '')
          To ping Yahoo you require a Yahoo AppID. Add it to your config/sitemap.rb with:
          SitemapGenerator::Sitemap.yahoo_app_id = "my_app_id"
          For more information see http://developer.yahoo.com/search/siteexplorer/V1/updateNotification.html
        END
      end
    end

    # Return a count of the total number of links in all sitemaps
    def link_count
      sitemap_index.total_link_count
    end

    # Return the host to use in links to the sitemap files.  This defaults to your
    # +default_host+.
    def sitemaps_host
      @sitemaps_host || @default_host
    end

    # Lazy-initialize a sitemap instance when it's accessed
    def sitemap
      @sitemap ||= SitemapGenerator::Builder::SitemapFile.new(sitemap_location)
    end

    # Lazy-initialize a sitemap index instance when it's accessed
    def sitemap_index
      @sitemap_index ||= SitemapGenerator::Builder::SitemapIndexFile.new(sitemap_index_location)
    end

    # Finalize the current sitemap and then the sitemap index.
    def finalize!
      finalize_sitemap!
      finalize_sitemap_index!
    end

    # Return a boolean indicating whether to add a link to the sitemap index file
    # to the current sitemap.  This points search engines to your Sitemap Index so
    # they include it in the indexing of your site, but is not strictly necessary.
    # Default is `true`.  Turned off when `sitemaps_host` differs from `default_host`
    # or within a `group()` block.
    def include_index?
      if default_host && sitemaps_host && sitemaps_host != default_host
        false
      else
        @include_index
      end
    end

    # Return a boolean indicating whether to automatically add the root url i.e. '/' to the
    # current sitemap.  Default is `true`.  Turned off within a `group()` block.
    def include_root?
      !!@include_root
    end

    protected

    # Set each option on this instance using accessor methods.  This will affect
    # both the sitemap and the sitemap index.
    #
    # If both `filename` and `sitemaps_namer` are passed, set filename first so it
    # doesn't override the latter.
    #
    # The +opts+ hash passed in is not modified.
    def set_options(opts={})
      # Work on a copy: the keys handled first are deleted so they are not set twice.
      opts = opts.dup
      %w(filename sitemaps_namer).each do |key|
        if value = opts.delete(key.to_sym)
          send("#{key}=", value)
        end
      end
      opts.each_pair do |key, value|
        send("#{key}=", value)
      end
    end

    # Given +opts+, return a hash of options prepped for creating a new group from this LinkSet.
    # If :public_path is present in +opts+ it is removed because groups cannot
    # change the public path.
    #
    # +opts+ is modified in place and returned.
    def options_for_group(opts)
      opts.delete(:public_path)
      opts.reverse_merge!(
        :include_index => false,
        :include_root => false,
        :sitemap_index => sitemap_index
      )

      # Reverse merge the current settings.  Only settings with a truthy value
      # are inherited.
      current_settings = [
        :include_root,
        :include_index,
        :sitemaps_path,
        :public_path,
        :sitemaps_host,
        :verbose,
        :default_host
      ].inject({}) do |hash, key|
        if value = instance_variable_get(:"@#{key}")
          hash[key] = value
        end
        hash
      end
      opts.reverse_merge!(current_settings)
      opts
    end

    # Add default links if those options are turned on.  Record the fact that we have done so
    # in an instance variable.
    def add_default_links
      if include_root?
        sitemap.add('/', :lastmod => Time.now, :changefreq => 'always', :priority => 1.0, :host => @default_host)
      end
      if include_index?
        sitemap.add(sitemap_index, :lastmod => Time.now, :changefreq => 'always', :priority => 1.0)
      end
      @added_default_links = true
    end

    # Finalize a sitemap by including it in the index and outputting a summary line.
    # Do nothing if it has already been finalized.
    #
    # Don't finalize if the sitemap is empty and a group has been created.  The reason
    # being that the group will have written out its sitemap.
    #
    # Add the default links if they have not been added yet and no groups have been created.
    # If the default links haven't been added we know that the sitemap is empty,
    # because they are added on the first call to add().  This ensures that if the
    # block passed to create() is empty the default links are still included in the
    # sitemap.
    def finalize_sitemap!
      add_default_links if !@added_default_links && !@created_group
      return if sitemap.finalized? || (sitemap.empty? && @created_group)
      sitemap_index.add(sitemap)
      puts sitemap.summary if verbose
    end

    # Finalize a sitemap index and output a summary line.  Do nothing if it has already
    # been finalized, or if the index was passed in and is therefore protected.
    def finalize_sitemap_index!
      return if @protect_index || sitemap_index.finalized?
      sitemap_index.finalize!
      puts sitemap_index.summary if verbose
    end

    # Return the interpreter linked to this instance.
    def interpreter
      require 'sitemap_generator/interpreter'
      @interpreter ||= SitemapGenerator::Interpreter.new(:link_set => self)
    end

    # Reset this instance.  Keep the same options, but return to the same state
    # as before any sitemaps were created.
    def reset!
      @sitemap_index = nil if @sitemap_index && @sitemap_index.finalized? && !@protect_index
      @sitemap = nil if @sitemap && @sitemap.finalized?
      self.sitemaps_namer.reset # start from 1
      @added_default_links = false
    end

    module LocationHelpers
      public

      # Set the host name, including protocol, that will be used by default on each
      # of your sitemap links.  You can pass a different host in your options to `add`
      # if you need to change it on a per-link basis.
      def default_host=(value)
        @default_host = value
        # Sitemap files are located on sitemaps_host, which only falls back to the
        # default host when no explicit sitemaps_host has been set.
        update_location_info(:host, @sitemaps_host || @default_host)
      end

      # Set the public_path.  This path gives the location of your public directory.
      # The default is the public/ directory in your Rails root.  Or if Rails is not
      # found, it defaults to public/ in the current directory (of the process).
      #
      # Example: 'tmp/' if you don't want to generate in public for some reason.
      #
      # Set to nil to use the current directory.
      def public_path=(value)
        @public_path = Pathname.new(value.to_s)
        @public_path = SitemapGenerator.app.root + @public_path if @public_path.relative?
        update_location_info(:public_path, @public_path)
        @public_path
      end

      # Return a Pathname with the full path to the public directory
      def public_path
        @public_path ||= self.send(:public_path=, 'public/')
      end

      # Set the sitemaps_path.  This path gives the location to write sitemaps to
      # relative to your public_path.
      # Example: 'sitemaps/' to generate your sitemaps in 'public/sitemaps/'.
      def sitemaps_path=(value)
        @sitemaps_path = value
        update_location_info(:sitemaps_path, value)
      end

      # Set the host name, including protocol, that will be used on all links to your sitemap
      # files.  Useful when the server that hosts the sitemaps is not on the same host as
      # the links in the sitemap.
      #
      # Note that `include_index` will be turned off to avoid adding a link to a sitemap with
      # a different host than the other links.
      def sitemaps_host=(value)
        @sitemaps_host = value
        # Setting nil falls back to the default host, mirroring the reader.
        update_location_info(:host, @sitemaps_host || @default_host)
      end

      # Set the filename base to use when generating sitemaps and sitemap indexes.
      # The index name will be +value+ with _index.xml.gz appended.
      # === Example
      #   filename = :sitemap
      def filename=(value)
        @filename = value
        self.sitemaps_namer = SitemapGenerator::SitemapNamer.new(@filename)
        self.sitemap_index_namer = SitemapGenerator::SitemapIndexNamer.new("#{@filename}_index")
      end

      # Set the namer to use when generating SitemapFiles (does not apply to the
      # SitemapIndexFile)
      def sitemaps_namer=(value)
        @sitemaps_namer = value
        @sitemap.location[:namer] = value if @sitemap && !@sitemap.finalized?
      end

      # Return the current sitemaps namer object.  If it is not set, looks for it on
      # the current sitemap and if there is no sitemap, creates a new one using
      # the current filename.
      def sitemaps_namer
        @sitemaps_namer ||= (@sitemap && @sitemap.location.namer) || SitemapGenerator::SitemapNamer.new(@filename)
      end

      # Set the namer to use when generating the SitemapIndexFile (does not apply to
      # the SitemapFiles).  A protected index is left untouched.
      def sitemap_index_namer=(value)
        @sitemap_index_namer = value
        @sitemap_index.location[:namer] = value if @sitemap_index && !@sitemap_index.finalized? && !@protect_index
      end

      # Return the current sitemap index namer object.  If it is not set, looks for it
      # on the current sitemap index and if there is no index, creates a new one using
      # the current filename.
      def sitemap_index_namer
        @sitemap_index_namer ||= (@sitemap_index && @sitemap_index.location.namer) || SitemapGenerator::SitemapIndexNamer.new("#{@filename}_index")
      end

      # Return a new +SitemapLocation+ instance with the current options included
      def sitemap_location
        SitemapGenerator::SitemapLocation.new(
          :host => sitemaps_host,
          :namer => sitemaps_namer,
          :public_path => public_path,
          :sitemaps_path => @sitemaps_path,
          :adapter => @adapter
        )
      end

      # Return a new +SitemapLocation+ instance for the sitemap index, with the
      # current options included.  It differs from +sitemap_location+ only in the namer.
      def sitemap_index_location
        SitemapGenerator::SitemapLocation.new(
          :host => sitemaps_host,
          :namer => sitemap_index_namer,
          :public_path => public_path,
          :sitemaps_path => @sitemaps_path,
          :adapter => @adapter
        )
      end

      protected

      # Update the given attribute on the current sitemap index and sitemap file location objects.
      # But don't create the index or sitemap files yet if they are not already created.
      #
      # Pass :include_index => false in +opts+ to leave the index location alone;
      # it defaults to +true+ unless the index is protected.
      def update_location_info(attribute, value, opts={})
        opts = { :include_index => !@protect_index }.merge(opts)
        @sitemap_index.location[attribute] = value if opts[:include_index] && @sitemap_index && !@sitemap_index.finalized?
        @sitemap.location[attribute] = value if @sitemap && !@sitemap.finalized?
      end
    end
    include LocationHelpers
  end
end