#-- # Copyright (c) 2005 Robert Aman # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #++ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] || ENV['RAILS_ENV'] || 'production' # :nodoc: FEED_TOOLS_VERSION = "0.2.1" $:.unshift(File.dirname(__FILE__)) $:.unshift(File.dirname(__FILE__) + "/../../activerecord/lib") $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor") require 'rubygems' require 'active_record' begin require 'builder' rescue LoadError # RubyGems version is not available, use included Builder require 'feed_tools/vendor/builder' end begin require 'tidy' rescue LoadError # Ignore the error for now. end require 'feed_tools/vendor/htree' require 'net/http' require 'net/https' require 'net/ftp' require 'rexml/document' require 'iconv' require 'uri' require 'time' require 'cgi' require 'pp' require 'yaml' #= feed_tools.rb # # FeedTools was designed to be a simple XML feed parser, generator, and translator with a built-in # caching system. 
# #== Example # slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss') # slashdot_feed.title # => "Slashdot" # slashdot_feed.description # => "News for nerds, stuff that matters" # slashdot_feed.link # => "http://slashdot.org/" # slashdot_feed.items.first.find_node("slash:hitparade/text()").to_s # => "43,37,28,23,11,3,1" module FeedTools # The default caching mechanism for the FeedTools module class DatabaseFeedCache < ActiveRecord::Base # Overrides the default table name to use the "feeds" table. def self.table_name() "feeds" end # If ActiveRecord is not already connected, attempts to find a configuration file and use # it to open a connection for ActiveRecord. # This method is probably unnecessary for anything but testing and debugging purposes. # In a Rails environment, the connection will already have been established # and this method will simply do nothing. # # This method should not raise any exceptions because it's designed to be run only when # the module is first loaded. If it fails, the user should get an exception when they # try to perform some action that makes use of the caching functionality, and not until. def DatabaseFeedCache.initialize_cache # Establish a connection if we don't already have one begin ActiveRecord::Base.connection rescue begin possible_config_files = [ "./config/database.yml", "../database.yml", "./database.yml" ] database_config_file = nil for file in possible_config_files if File.exists? file database_config_file = file break end end database_config_hash = File.open(database_config_file) do |file| config_hash = YAML::load(file) unless config_hash[FEED_TOOLS_ENV].nil? 
config_hash = config_hash[FEED_TOOLS_ENV] end config_hash end ActiveRecord::Base.configurations = database_config_hash ActiveRecord::Base.establish_connection(database_config_hash) ActiveRecord::Base.connection rescue end end # Verify that the necessary database tables are in place # and if they're missing, create them unless DatabaseFeedCache.table_exists? DatabaseFeedCache.create_table end return nil end # True if the appropriate database table already exists def DatabaseFeedCache.table_exists? begin ActiveRecord::Base.connection.execute "select id, url, title, " + "link, xml_data, http_headers, last_retrieved " + "from feeds limit 1" rescue ActiveRecord::StatementInvalid return false rescue return false end return true end # Creates the appropriate database table def DatabaseFeedCache.create_table unless DatabaseFeedCache.table_exists? feeds_mysql = <<-SQL_END CREATE TABLE `feeds` ( `id` int(10) unsigned NOT NULL auto_increment, `url` varchar(255) default NULL, `title` varchar(255) default NULL, `link` varchar(255) default NULL, `xml_data` longtext default NULL, `http_headers` text default NULL, `last_retrieved` datetime default NULL, PRIMARY KEY (`id`) ) ENGINE=MyISAM DEFAULT CHARSET=latin1; SQL_END feeds_sqlite = <<-SQL_END CREATE TABLE 'feeds' ( 'id' INTEGER PRIMARY KEY NOT NULL, 'url' VARCHAR(255) DEFAULT NULL, 'title' VARCHAR(255) DEFAULT NULL, 'link' VARCHAR(255) DEFAULT NULL, 'image_link' VARCHAR(255) DEFAULT NULL, 'xml_data' TEXT DEFAULT NULL, 'http_headers' TEXT DEFAULT NULL, 'last_retrieved' DATETIME DEFAULT NULL, ); SQL_END feeds_psql = <<-SQL_END CREATE TABLE feeds ( id SERIAL PRIMARY KEY NOT NULL, url varchar(255) default NULL, title varchar(255) default NULL, link varchar(255) default NULL, xml_data text default NULL, http_headers text default NULL, last_retrieved datetime default NULL, ); SQL_END table_creation_sql = nil if configurations["adapter"] == "mysql" table_creation_sql = feeds_mysql elsif configurations["adapter"] == "sqlite" 
table_creation_sql = feeds_sqlite elsif configurations["adapter"] == "postgresql" table_creation_sql = feeds_psql end if table_creation_sql.nil? raise "Could not build feed_items table." else connection.execute table_creation_sql end end end end # Error raised when a feed cannot be retrieved class FeedAccessError < StandardError end # Quick method of enabling small classes to have their attributes # accessible as a dictionary. module AttributeDictionary # :nodoc: # Access the attributes as a dictionary def [](key) # Assignment, and destructive methods should not be # accessed like this. return nil if key[-1..-1] == "=" || key[-1..-1] == "!" return nil unless self.method(key).arity == 0 return self.send(key) end # Access the attributes as a dictionary def []=(key, value) # Assignment, and destructive methods should not be # accessed like this. return nil if key[-1..-1] == "=" || key[-1..-1] == "!" return nil unless self.method(key + "=").arity == 1 return self.send(key + "=", value) end end @feed_cache = DatabaseFeedCache @user_agent = "FeedTools/#{FEED_TOOLS_VERSION} " + "+http://www.sporkmonger.com/projects/feedtools/" # Returns the current caching mechanism. def FeedTools.feed_cache return @feed_cache end # Sets the current caching mechanism. If set to nil, disables caching. # Default is the DatabaseFeedCache class. # # Objects of this class must accept the following messages: # url # url= # title # title= # link # link= # xml_data # xml_data= # etag # etag= # last_modified # last_modified= # save # # Additionally, the class itself must accept the following messages: # find_by_id # find_by_url # initialize_cache def FeedTools.feed_cache=(new_feed_cache) # TODO: ensure that the feed cache class actually does those things. # ================================================================== @feed_cache = new_feed_cache end # Returns the currently used user agent string. 
def FeedTools.user_agent return @user_agent end # Sets the user agent string to send in the http headers. def FeedTools.user_agent=(new_user_agent) @user_agent = new_user_agent end # Returns true if the html tidy module can be used. # # Obviously, you need the tidy gem installed in order to run with html # tidy features turned on. # # This method does a fairly complicated, and probably unnecessarily # desperate search for the libtidy library. If you want this thing to # execute fast, the best thing to do is to set Tidy.path ahead of time. # If Tidy.path is set, this method doesn't do much. If it's not set, # it will do it's darnedest to find the libtidy library. If you set # the LIBTIDYPATH environment variable to the libtidy library, it should # be able to find it. # # Once the library is located, this method will run much faster. def FeedTools.tidy_enabled? # This is an override variable to keep tidy from being used even if it # is available. if @force_tidy_enabled == false return false end if @tidy_enabled.nil? || @tidy_enabled == false @tidy_enabled = false begin require 'tidy' if Tidy.path.nil? # *Shrug*, just brute force it, I guess. There's a lot of places # this thing might be hiding in, depending on platform and general # sanity of the person who installed the thing. Most of these are # probably unlikely, but it's not like checking unlikely locations # hurts. Much. Especially if you actually find it. 
libtidy_locations = [ '/usr/local/lib/libtidy.dylib', '/opt/local/lib/libtidy.dylib', '/usr/lib/libtidy.dylib', '/usr/local/lib/tidylib.dylib', '/opt/local/lib/tidylib.dylib', '/usr/lib/tidylib.dylib', '/usr/local/lib/tidy.dylib', '/opt/local/lib/tidy.dylib', '/usr/lib/tidy.dylib', '/usr/local/lib/libtidy.so', '/opt/local/lib/libtidy.so', '/usr/lib/libtidy.so', '/usr/local/lib/tidylib.so', '/opt/local/lib/tidylib.so', '/usr/lib/tidylib.so', '/usr/local/lib/tidy.so', '/opt/local/lib/tidy.so', '/usr/lib/tidy.so', 'C:\Program Files\Tidy\tidy.dll', 'C:\Tidy\tidy.dll', '/usr/local/lib', '/opt/local/lib', '/usr/lib' ] # We just made this thing up, but if someone sets it, we'll # go ahead and check it unless ENV['LIBTIDYPATH'].nil? libtidy_locations = libtidy_locations.reverse.push(ENV['LIBTIDYPATH']) end for path in libtidy_locations if File.exists? path if File.ftype(path) == "file" Tidy.path = path @tidy_enabled = true break elsif File.ftype(path) == "directory" # Ok, now perhaps we're getting a bit more desperate lib_paths = `find #{path} -name '*tidy*' | grep '\\.\\(so\\|dylib\\)$'` # If there's more than one, grab the first one and # hope for the best, and if it doesn't work, then blame the # user for not specifying more accurately. tidy_path = lib_paths.split("\n").first unless tidy_path.nil? Tidy.path = tidy_path @tidy_enabled = true break end end end end # Still couldn't find it. unless @tidy_enabled @tidy_enabled = false end else @tidy_enabled = true end rescue LoadError # Tidy not installed, disable features that rely on tidy. @tidy_enabled = false end end return @tidy_enabled end # Turns html tidy support on or off. Be aware, that setting this to true # does not mean tidy will be enabled. It simply means that tidy will be # enabled if it is available to be enabled. def FeedTools.tidy_enabled=(new_tidy_enabled) @force_tidy_enabled = new_tidy_enabled end # Attempts to ensures that the passed url is valid and sane. 
Accepts very, very ugly urls # and makes every effort to figure out what it was supposed to be. Also translates from # the feed: and rss: pseudo-protocols to the http: protocol. def FeedTools.normalize_url(url) if url.nil? || url == "" return nil end normalized_url = url # if a url begins with the '/' character, it only makes sense that they # meant to be using a file:// url. Fix it for them. if normalized_url.length > 0 && normalized_url[0..0] == "/" normalized_url = "file://" + normalized_url end # if a url begins with javascript:, it's quite possibly an attempt at # doing something malicious. Let's keep that from getting anywhere, # shall we? if (normalized_url.downcase =~ /javascript:/) != nil return "#" end # deal with all of the many ugly possibilities involved in the rss: # and feed: pseudo-protocols (incidentally, whose crazy idea was this # mess?) normalized_url.gsub!(/^http:\/*(feed:\/*)?/, "http://") normalized_url.gsub!(/^http:\/*(rss:\/*)?/, "http://") normalized_url.gsub!(/^feed:\/*(http:\/*)?/, "http://") normalized_url.gsub!(/^rss:\/*(http:\/*)?/, "http://") normalized_url.gsub!(/^file:\/*/, "file:///") normalized_url.gsub!(/^https:\/*/, "https://") # fix (very) bad urls (usually of the user-entered sort) normalized_url.gsub!(/^http:\/*(http:\/*)*/, "http://") if (normalized_url =~ /^file:/) == 0 # fix bad Windows-based entries normalized_url.gsub!(/file:\/\/\/([a-zA-Z]):/, 'file:///\1|') # maybe this is too aggressive? 
normalized_url.gsub!(/\\/, '/') return normalized_url else if (normalized_url =~ /https?:\/\//) == nil normalized_url = "http://" + normalized_url end if normalized_url == "http://" return nil end begin feed_uri = URI.parse(normalized_url) if feed_uri.scheme == nil feed_uri.scheme = "http" end if feed_uri.path == nil || feed_uri.path == "" feed_uri.path = "/" end if (feed_uri.path =~ /^[\/]+/) == 0 feed_uri.path.gsub!(/^[\/]+/, "/") end return feed_uri.to_s rescue URI::InvalidURIError return normalized_url end end end # Returns true if the parameter appears to be a valid url def FeedTools.is_url?(url) return false if url.nil? begin uri = URI.parse(url) rescue URI::InvalidURIError return false end return true end # Removes all html tags from the html formatted text. def FeedTools.strip_html(html) # TODO: do this properly # ====================== stripped_html = html.gsub(/<\/?[^>]+>/, "") return stripped_html end # Tidys up the html def FeedTools.tidy_html(html) if FeedTools.tidy_enabled? is_fragment = true if (html.strip =~ /(.|\n)*/) != nil || (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil is_fragment = false end if (html.strip =~ /<\?xml(.|\n)*\?>/) != nil is_fragment = false end tidy_html = Tidy.open(:show_warnings=>false) do |tidy| tidy.options.output_xml = true tidy.options.indent = false tidy.options.wrap_attributes = true tidy.options.logical_emphasis = true tidy.options.doctype = "omit" xml = tidy.clean(html) xml end if is_fragment # Tidy puts ...[our html]... in. # We don't want this. tidy_html.strip! tidy_html.gsub!(/^(.|\n)*/, "") tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "") tidy_html.strip! end else tidy_html = html end return tidy_html end # Removes all dangerous html tags from the html formatted text. # If mode is set to :escape, dangerous and unknown elements will # be escaped. If mode is set to :strip, dangerous and unknown # elements and all children will be removed entirely. # Dangerous or unknown attributes are always removed. 
def FeedTools.sanitize_html(html, mode=:escape) # Lists borrowed from Mark Pilgrim's feedparser acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var'] acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'] # Stupid hack to pass this unit test: # http://feedparser.org/tests/wellformed/rss/ # item_description_not_a_doctype.xml html.gsub!(/" + html + "").to_rexml sanitize_node = lambda do |html_node| if html_node.respond_to? :children for child in html_node.children if child.kind_of? REXML::Element unless acceptable_elements.include? 
child.name if mode == :strip html_node.delete_element(child) else new_child = REXML::Text.new(CGI.escapeHTML(child.to_s)) html_node.insert_after(child, new_child) html_node.delete_element(child) end end for attribute in child.attributes.keys unless acceptable_attributes.include? attribute child.delete_attribute(attribute) end end end sanitize_node.call(child) end end html_node end sanitize_node.call(html_doc.root) return html_doc.root.inner_xml end class Feed include REXML include AttributeDictionary # Loads the feed specified by the url, pulling the data from the cache if it hasn't expired. def Feed.open(url) # clean up the url url = FeedTools.normalize_url(url) # create and load the new feed feed = Feed.new feed.url = url feed.update return feed end # Loads the feed from the remote url if the feed has expired from the cache or cannot be # retrieved from the cache for some reason. def update if self.http_headers.nil? && !(self.cache_object.nil?) && !(self.cache_object.http_headers.nil?) @http_headers = YAML.load(self.cache_object.http_headers) end if expired? load_remote_feed else @live = false end end # Attempts to load the feed from the remote location. Requires the url # field to be set. If an etag or the last_modified date has been set, # attempts to use them to prevent unnecessary reloading of identical # content. def load_remote_feed @live = true if self.http_headers.nil? && !(self.cache_object.nil?) && !(self.cache_object.http_headers.nil?) @http_headers = YAML.load(self.cache_object.http_headers) end if (self.url =~ /^feed:/) == 0 # Woah, Nelly, how'd that happen? You should've already been # corrected. So let's fix that url. And please, # just use less crappy browsers instead of badly defined # pseudo-protocol hacks. self.url = FeedTools.normalize_url(self.url) end # Find out what method we're going to be using to obtain this feed. 
      # Determine the retrieval scheme from the (already normalized) url.
      uri = URI.parse(self.url)
      retrieval_method = "http"
      case uri.scheme
      when "http"
        retrieval_method = "http"
      when "ftp"
        retrieval_method = "ftp"
      when "file"
        retrieval_method = "file"
      when nil
        raise FeedAccessError,
          "No protocol was specified in the url."
      else
        raise FeedAccessError,
          "Cannot retrieve feed using unrecognized protocol: " + uri.scheme
      end

      # No need for http headers unless we're actually doing http
      if retrieval_method == "http"
        # Set up the appropriate http headers for conditional GET: reuse
        # the cached etag / last-modified values when we have them.
        headers = {}
        unless self.http_headers.nil?
          headers["If-None-Match"] =
            self.http_headers['etag'] unless self.http_headers['etag'].nil?
          headers["If-Modified-Since"] =
            self.http_headers['last-modified'] unless
            self.http_headers['last-modified'].nil?
        end
        headers["User-Agent"] =
          FeedTools.user_agent unless FeedTools.user_agent.nil?

        # The http feed access method.
        # NOTE(review): this `def` inside an instance method (re)defines
        # Feed#http_fetch at runtime on every call -- it works, but it is
        # an unusual construct; left untouched to preserve behavior.
        def http_fetch(feed_url, http_headers, redirect_limit = 10,
            response_chain = []) # :nodoc:
          raise FeedAccessError, 'Redirect too deep' if redirect_limit == 0
          feed_uri = nil
          begin
            feed_uri = URI.parse(feed_url)
          rescue URI::InvalidURIError
            # Uh, maybe try to fix it?
            feed_uri = URI.parse(FeedTools.normalize_url(feed_url))
          end

          # Borrowed from open-uri:
          # According to RFC2616 14.23, Host: request-header field should be
          # set to an origin server.
          # But net/http wrongly set a proxy server if an absolute URI is
          # specified as a request URI.
          # So override it here explicitly.
          http_headers['Host'] = feed_uri.host
          http_headers['Host'] += ":#{feed_uri.port}" if feed_uri.port

          Net::HTTP.start(feed_uri.host, (feed_uri.port or 80)) do |http|
            response = http.request_get(feed_uri.path, http_headers)
            case response
            when Net::HTTPSuccess
              # We've reached the final destination, process all previous
              # redirections, and see if we need to update the url.
              for redirected_response in response_chain
                if redirected_response.last.code.to_i == 301
                  # Reflect permanent redirects back into the feed's url.
                  self.url = redirected_response.first
                else
                  # Jump out as soon as we hit anything that isn't a
                  # permanently moved redirection.
                  break
                end
              end
              return response
            when Net::HTTPRedirection
              # Net::HTTPNotModified (304) is a subclass of
              # Net::HTTPRedirection, so the cache-hit case lands here.
              if response.code.to_i == 304
                response.error!
              else
                if response['Location'].nil?
                  raise FeedAccessError,
                    "No location to redirect to supplied: " + response.code
                end
                response_chain << [feed_url, response]
                new_location = response['location']
                if response_chain.assoc(new_location) != nil
                  raise FeedAccessError, "Redirection loop detected."
                end

                # TODO: deal with stupid people using relative urls
                # in Location header
                # =================================================
                http_fetch(new_location, http_headers,
                  redirect_limit - 1, response_chain)
              end
            else
              response.error!
            end
          end
        end

        begin
          @http_response = http_fetch(self.url, headers)
          @http_headers = {}
          self.http_response.each_header do |header|
            self.http_headers[header.first.downcase] = header.last
          end
          self.last_retrieved = Time.now
          self.xml_data = self.http_response.body
        rescue FeedAccessError
          @live = false
          if self.xml_data.nil?
            raise
          end
        rescue Timeout::Error
          # if we time out, do nothing, it should fall back to the xml_data
          # stored in the cache.
          @live = false
          if self.xml_data.nil?
            raise
          end
        rescue Errno::ECONNRESET
          # if the connection gets reset by peer, oh well, fall back to the
          # xml_data stored in the cache
          @live = false
          if self.xml_data.nil?
            raise
          end
        rescue => error
          # heck, if anything at all bad happens, fall back to the xml_data
          # stored in the cache.

          # If we can, get the HTTPResponse...
          @http_response = nil
          if error.respond_to?(:each_header)
            @http_response = error
          end
          if error.respond_to?(:response) &&
              error.response.respond_to?(:each_header)
            @http_response = error.response
          end
          if @http_response != nil
            @http_headers = {}
            # NOTE(review): header keys are NOT downcased here, unlike the
            # success path above -- looks like an inconsistency; confirm
            # before relying on header key casing.
            self.http_response.each_header do |header|
              self.http_headers[header.first] = header.last
            end
            if self.http_response.code.to_i == 304
              # 304 means the cached copy is still fresh.
              self.last_retrieved = Time.now
            end
          end
          @live = false
          if self.xml_data.nil?
            raise
          end
        end
      elsif retrieval_method == "https"
        # Not supported... yet
      elsif retrieval_method == "ftp"
        # Not supported... yet
        # Technically, CDF feeds are supposed to be able to be accessed
        # directly from an ftp server.  This is silly, but we'll humor
        # Microsoft.
        #
        # Eventually.
      elsif retrieval_method == "file"
        # Now that we've gone to all that trouble to ensure the url begins
        # with 'file://', strip the 'file://' off the front of the url.
        file_name = self.url.gsub(/^file:\/\//, "")
        begin
          # NOTE(review): Kernel#open will execute a command for paths
          # beginning with '|' -- verify file urls are trusted input.
          open(file_name) do |file|
            @http_response = nil
            @http_headers = {}
            self.last_retrieved = Time.now
            self.xml_data = file.read
          end
        rescue
          @live = false
          # In this case, pulling from the cache is probably not going
          # to help at all, and the use should probably be immediately
          # appraised of the problem.  Raise the exception.
          raise
        end
      end
      unless self.cache_object.nil?
        begin
          self.save
        rescue
        end
      end
    end

    # Returns the relevant information from an http request.
    def http_response
      return @http_response
    end

    # Returns a hash of the http headers from the response.
    def http_headers
      return @http_headers
    end

    # Returns the feed's raw xml data.
    def xml_data
      # Lazily fall back to the cached copy when nothing has been fetched.
      if @xml_data.nil?
        unless self.cache_object.nil?
          @xml_data = self.cache_object.xml_data
        end
      end
      return @xml_data
    end

    # Sets the feed's xml data.
    def xml_data=(new_xml_data)
      @xml_data = new_xml_data
      # Keep the cache object in sync with the in-memory copy.
      unless self.cache_object.nil?
        self.cache_object.xml_data = new_xml_data
      end
    end

    # Returns a REXML Document of the xml_data
    def xml
      if @xml_doc.nil?
        begin
          @xml_doc = Document.new(xml_data)
        rescue
          # Something failed, attempt to repair the xml with htree.
          @xml_doc = HTree.parse(xml_data).to_rexml
        end
      end
      return @xml_doc
    end

    # Returns the first node within the channel_node that matches the
    # xpath query.
    def find_node(xpath)
      return XPath.first(channel_node, xpath)
    end

    # Returns all nodes within the channel_node that match the xpath query.
    def find_all_nodes(xpath)
      return XPath.match(channel_node, xpath)
    end

    # Returns the root node of the feed.
    def root_node
      if @root_node.nil?
        @root_node = xml.root
      end
      return @root_node
    end

    # Returns the channel node of the feed.
    # Falls back from "channel" (RSS) to "feedinfo" (CDF) to the root node
    # itself (Atom) so xpath queries have a sane starting point.
    def channel_node
      if @channel_node.nil?
        @channel_node = XPath.first(root_node, "channel")
        if @channel_node == nil
          @channel_node = XPath.first(root_node, "feedinfo")
        end
        if @channel_node == nil
          @channel_node = root_node
        end
      end
      return @channel_node
    end

    # The cache object that handles the feed persistence.
    def cache_object
      unless FeedTools.feed_cache.nil?
        if @cache_object.nil?
          begin
            # Look up by id first, then by url; create a fresh cache record
            # if neither finds anything.
            if @id != nil
              @cache_object = FeedTools.feed_cache.find_by_id(@id)
            elsif @url != nil
              @cache_object = FeedTools.feed_cache.find_by_url(@url)
            end
            if @cache_object.nil?
              @cache_object = FeedTools.feed_cache.new
            end
          rescue
          end
        end
      end
      return @cache_object
    end

    # Sets the cache object for this feed.
    #
    # This can be any object, but it must accept the following messages:
    #  url
    #  url=
    #  title
    #  title=
    #  link
    #  link=
    #  xml_data
    #  xml_data=
    #  etag
    #  etag=
    #  last_modified
    #  last_modified=
    #  save
    def cache_object=(new_cache_object)
      @cache_object = new_cache_object
    end

    # Returns the feed's unique id
    def id
      if @id.nil?
        # Prefer the Atom <id> element, fall back to RSS <guid>.
        @id = XPath.first(root_node, "id/text()").to_s
        if @id == ""
          @id = XPath.first(root_node, "guid/text()").to_s
        end
        @id = nil if @id == ""
      end
      return @id
    end

    # Sets the feed's unique id
    def id=(new_id)
      @id = new_id
    end

    # Returns the feed url.
    def url
      # Only derive the url from the document's rel='self' link when we
      # actually have xml data to look in.
      if @url.nil? && self.xml_data != nil
        @url = XPath.first(channel_node, "link[@rel='self']/@href").to_s
        @url = nil if @url == ""
      end
      return @url
    end

    # Sets the feed url and prepares the cache_object if necessary.
    def url=(new_url)
      @url = FeedTools.normalize_url(new_url)
      # NOTE(review): the cache receives the raw new_url while @url holds
      # the normalized form -- confirm this asymmetry is intentional.
      self.cache_object.url = new_url unless self.cache_object.nil?
    end

    # Returns the feed title
    def title
      if @title.nil?
        # xhtml titles keep their inline markup; escaped and plain titles
        # are unescaped from the text node.
        if XPath.first(channel_node, "title/@type").to_s == "xhtml" ||
            XPath.first(channel_node, "title/@mode").to_s == "xhtml"
          @title = XPath.first(channel_node, "title").inner_xml
        elsif XPath.first(channel_node, "title/@type").to_s == "escaped" ||
            XPath.first(channel_node, "title/@mode").to_s == "escaped"
          @title = CGI.unescapeHTML(
            XPath.first(channel_node, "title/text()").to_s)
        else
          @title = CGI.unescapeHTML(
            XPath.first(channel_node, "title/text()").to_s)
        end
        unless @title.nil?
          # Strip anything dangerous, then unescape what sanitizing escaped.
          @title = CGI.unescapeHTML(FeedTools.sanitize_html(@title, :strip))
        end
        if @title != "" && !(@title.nil?)
          @title = FeedTools.strip_html(@title).strip
        end
        # NOTE(review): gsub! here assumes @title is non-nil at this point;
        # the branches above always leave a string, but verify inner_xml
        # can never return nil.
        @title.gsub!(/\n/, " ")
        @title = nil if @title == ""
        self.cache_object.title = @title unless self.cache_object.nil?
      end
      return @title
    end

    # Sets the feed title
    def title=(new_title)
      @title = new_title
      self.cache_object.title = new_title unless self.cache_object.nil?
    end

    # Returns the feed description.  Tries, in order: RSS <description>,
    # Atom <subtitle>/<tagline>/<info>, <abstract>, <summary>,
    # <content:encoded>, the raw inner xml of <description>, and finally
    # the itunes summary/subtitle elements.
    def description
      if @description.nil?
        # get the feed description from the xml document
        @description = XPath.first(channel_node, "description/text()").to_s
        if @description != ""
          if XPath.first(channel_node, "description/@encoding").to_s != ""
            # Binary/encoded payloads in a description aren't handled.
            @description = "[Embedded data objects are not supported.]"
          else
            # The bare `description` call re-enters this method; since
            # @description is non-nil by now it just returns it unescaped.
            @description = CGI.unescapeHTML(description)
          end
        end
        if @description == ""
          @description = XPath.first(channel_node, "subtitle/text()").to_s
          if @description != "" &&
              XPath.first(channel_node, "subtitle/@mode").to_s == "escaped"
            @description = CGI.unescapeHTML(description)
          end
        end
        if @description == ""
          @description = XPath.first(channel_node, "tagline/text()").to_s
          if @description != "" &&
              XPath.first(channel_node, "tagline/@mode").to_s == "escaped"
            @description = CGI.unescapeHTML(description)
          end
        end
        if @description == "" &&
            XPath.first(channel_node, "tagline") == nil
          @description = XPath.first(channel_node, "info/text()").to_s
          if @description != "" &&
              XPath.first(channel_node, "info/@mode").to_s == "escaped"
            @description =
CGI.unescapeHTML(description) end end if @description == "" @description = CGI.unescapeHTML( XPath.first(channel_node, "abstract/text()").to_s) end if @description == "" @description = CGI.unescapeHTML( XPath.first(channel_node, "summary/text()").to_s) end if @description == "" # I don't think this is valid for anyone to do, but this is probably # what they meant if they do it. @description = CGI.unescapeHTML( XPath.first(channel_node, "content:encoded/text()").to_s) if @description != "" @bozo = true end end if @description == "" begin @description = XPath.first(channel_node, "description").inner_xml rescue end end if @description == "" @description = self.itunes_summary @description = "" if @description.nil? end if @description == "" @description = self.itunes_subtitle @description = "" if @description.nil? end @description = FeedTools.sanitize_html(@description) unless @description.nil? # If it started with a bunch of divs, hack them right off. We can put # them back later if they're needed. @description.gsub!(/^(]*>)*/, "") @description.gsub!(/(<\/div>)*$/, "") @description.gsub!(/\n/, " ") if @description.size < 80 @description = @description.strip unless @description.nil? @description = nil if @description == "" end return @description end # Sets the feed description def description=(new_description) @description = new_description end # Returns the contents of the itunes:summary element def itunes_summary if @itunes_summary.nil? @itunes_summary = CGI.unescapeHTML(XPath.first(root_node, "itunes:summary/text()").to_s) if @itunes_summary == "" @itunes_summary = nil end @itunes_summary = FeedTools.sanitize_html(@itunes_summary) unless @itunes_summary.nil? end return @itunes_summary end # Sets the contents of the itunes:summary element def itunes_summary=(new_itunes_summary) @itunes_summary = new_itunes_summary end # Returns the contents of the itunes:subtitle element def itunes_subtitle if @itunes_subtitle.nil? 
@itunes_subtitle = CGI.unescapeHTML(XPath.first(root_node, "itunes:subtitle/text()").to_s) if @itunes_subtitle == "" @itunes_subtitle = nil end unless @itunes_subtitle.nil? @itunes_subtitle = FeedTools.sanitize_html(@itunes_subtitle) end end return @itunes_subtitle end # Sets the contents of the itunes:subtitle element def itunes_subtitle=(new_itunes_subtitle) @itunes_subtitle = new_itunes_subtitle end # Returns the feed link def link if @link.nil? # get the feed link from the xml document @link = XPath.first(channel_node, "link[@rel='alternate' @type='text/html']/@href").to_s if @link == "" @link = XPath.first(channel_node, "link[@rel='alternate']/@href").to_s end if @link == "" @link = XPath.first(channel_node, "link/@href").to_s end if @link == "" @link = XPath.first(channel_node, "link/text()").to_s end if @link == "" @link = XPath.first(channel_node, "@href").to_s end if @link == "" if FeedTools.is_url? self.guid @link = self.guid end end if @link == "" # Technically, we shouldn't use the base attribute for this, but if the href attribute # is missing, it's already a given that we're looking at a messed up CDF file. We can # always pray it's correct. @link = XPath.first(channel_node, "@base").to_s end @link = FeedTools.normalize_url(@link) unless self.cache_object.nil? self.cache_object.link = @link end end return @link end # Sets the feed link def link=(new_link) @link = new_link unless self.cache_object.nil? self.cache_object.link = new_link end end # Returns the feed image link def image_link if @image_link.nil? 
# get the feed image link from the xml document @image_link = XPath.first(channel_node, "image/url/text()").to_s if @image_link == "" @image_link = XPath.first(channel_node, "image/@rdf:resource").to_s end if @image_link == "" @image_link = XPath.first(channel_node, "link[@type='image/jpeg']/@href").to_s end if @image_link == "" @image_link = XPath.first(channel_node, "link[@type='image/gif']/@href").to_s end if @image_link == "" @image_link = XPath.first(channel_node, "link[@type='image/png']/@href").to_s end if @image_link == "" @image_link = XPath.first(channel_node, "logo[@style='image']/@href").to_s end if @image_link == "" @image_link = XPath.first(channel_node, "logo/@href").to_s end @image_link = FeedTools.normalize_url(@image_link) end return @image_link end # Sets the feed image link def image_link=(new_image_link) @image_link = new_image_link end # Returns the url to the icon file for this feed. # # This method uses the url from the link field in order to avoid grabbing # the favicon for services like feedburner. def icon_link if @icon_link.nil? @icon_link = XPath.first(channel_node, "link[@rel='icon']/@href").to_s if @icon_link == "" @icon_link = XPath.first(channel_node, "link[@rel='shortcut icon']/@href").to_s end if @icon_link == "" @icon_link = XPath.first(channel_node, "link[@type='image/x-icon']/@href").to_s end if @icon_link == "" @icon_link = XPath.first(channel_node, "icon/@href").to_s end if @icon_link == "" @icon_link = XPath.first(channel_node, "icon/text()").to_s end if @icon_link == "" link_uri = URI.parse(FeedTools.normalize_url(self.link)) @icon_link = link_uri.scheme + "://" + link_uri.host + "/favicon.ico" end end return @icon_link end # Returns the number of seconds before the feed should expire def time_to_live if @time_to_live.nil? 
    # -- continuation of Feed#time_to_live (opened on the previous line) --
    # get the feed time to live from the xml document
    # First source: RSS 1.0 syndication module (syn:updateFrequency scaled
    # by syn:updatePeriod). The accumulated value appears to be in hours.
    update_frequency = XPath.first(channel_node,
      "syn:updateFrequency/text()").to_s
    if update_frequency != ""
      update_period = XPath.first(channel_node,
        "syn:updatePeriod/text()").to_s
      if update_period == "daily"
        @time_to_live = update_frequency.to_i * 24
      elsif update_period == "weekly"
        @time_to_live = update_frequency.to_i * 24 * 7
      elsif update_period == "monthly"
        @time_to_live = update_frequency.to_i * 24 * 30
      elsif update_period == "yearly"
        @time_to_live = update_frequency.to_i * 24 * 365
      else
        # hourly
        @time_to_live = update_frequency.to_i
      end
    end
  end
  if @time_to_live.nil?
    # expressed in minutes
    # NOTE(review): integer division — a <ttl> under 60 minutes truncates
    # to 0 and falls through to the one-hour default below; confirm intended.
    update_frequency = XPath.first(channel_node, "ttl/text()").to_s
    if update_frequency != ""
      @time_to_live = (update_frequency.to_i / 60)
    end
  end
  if @time_to_live.nil?
    # Third source: CDF <schedule><intervaltime> attributes, summed into
    # hours (days*24 + hours + min/60 + sec/3600, integer arithmetic).
    @time_to_live = 0
    update_frequency_days = XPath.first(channel_node,
      "schedule/intervaltime/@days").to_s
    update_frequency_hours = XPath.first(channel_node,
      "schedule/intervaltime/@hour").to_s
    update_frequency_minutes = XPath.first(channel_node,
      "schedule/intervaltime/@min").to_s
    update_frequency_seconds = XPath.first(channel_node,
      "schedule/intervaltime/@sec").to_s
    if update_frequency_days != ""
      @time_to_live = @time_to_live + update_frequency_days.to_i * 24
    end
    if update_frequency_hours != ""
      @time_to_live = @time_to_live + update_frequency_hours.to_i * 1
    end
    if update_frequency_minutes != ""
      @time_to_live = @time_to_live + update_frequency_minutes.to_i / 60
    end
    if update_frequency_seconds != ""
      @time_to_live = @time_to_live + update_frequency_seconds.to_i / 3600
    end
    if @time_to_live == 0
      @time_to_live = nil
    end
  end
  if @time_to_live.nil? || @time_to_live == 0
    # Default to one hour
    @time_to_live = 1
  end
  @time_to_live = @time_to_live.round
  # NOTE(review): Numeric#hour is ActiveSupport — converts the hour count
  # into seconds, matching the setter's division by 3600; confirm callers
  # expect seconds.
  return @time_to_live.hour
end

# Sets the feed time to live. Takes seconds and stores whole hours, with a
# one-hour floor.
def time_to_live=(new_time_to_live)
  @time_to_live = (new_time_to_live / 3600).round
  @time_to_live = 1 if @time_to_live < 1
end

# Returns the feed generator, stripped of markup, or nil when absent.
def generator
  if @generator.nil?
    @generator = XPath.first(channel_node, "generator/text()").to_s
    @generator = FeedTools.strip_html(@generator)
    @generator = nil if @generator == ""
  end
  return @generator
end

# Sets the feed generator
def generator=(new_generator)
  @generator = new_generator
end

# Returns the feed docs url, stripped of markup, or nil when absent.
def docs
  if @docs.nil?
    @docs = XPath.first(channel_node, "docs/text()").to_s
    @docs = FeedTools.strip_html(@docs)
    @docs = nil if @docs == ""
  end
  return @docs
end

# Sets the feed docs
def docs=(new_docs)
  @docs = new_docs
end

# Returns the feed language. Falls back through <language>, dc:language,
# xml:lang on the channel and root, then defaults to "en-us" (downcased).
def language
  if @language.nil?
    @language = XPath.first(channel_node, "language/text()").to_s
    if @language == ""
      @language = XPath.first(channel_node, "dc:language/text()").to_s
    end
    if @language == ""
      @language = XPath.first(channel_node, "xml:lang/text()").to_s
    end
    if @language == ""
      @language = XPath.first(root_node, "xml:lang/text()").to_s
    end
    if @language == ""
      @language = "en-us"
    end
    @language = @language.downcase
    # Unreachable in practice given the "en-us" default above.
    @language = nil if @language == ""
  end
  return @language
end

# Sets the feed language
def language=(new_language)
  @language = new_language
end

# Returns true if this feed contains explicit material.
def explicit
  if @explicit.nil?
    if XPath.first(channel_node, "media:adult/text()").to_s.downcase == "true" ||
        XPath.first(channel_node, "itunes:explicit/text()").to_s.downcase == "yes" ||
        XPath.first(channel_node, "itunes:explicit/text()").to_s.downcase == "true"
      @explicit = true
    else
      @explicit = false
    end
  end
  return @explicit
end

# Sets whether or not the feed contains explicit material
# (the ternary's branches continue on the next chunk line)
def explicit=(new_explicit)
  @explicit = (new_explicit ?
true : false) end # Returns the feed items def items if @items.nil? raw_items = XPath.match(root_node, "item") if raw_items == nil || raw_items == [] raw_items = XPath.match(channel_node, "item") end if raw_items == nil || raw_items == [] raw_items = XPath.match(channel_node, "entry") end # create the individual feed items @items = [] if raw_items != nil for item_node in raw_items new_item = FeedItem.new new_item.xml_data = item_node.to_s new_item.feed = self @items << new_item end end end # Sort the items @items = @items.sort do |a,b| (b.time or Time.mktime(1970)) <=> (a.time or Time.mktime(1970)) end return @items end # The time that the feed was last requested from the remote server. Nil if it has # never been pulled, or if it was created from scratch. def last_retrieved unless self.cache_object.nil? @last_retrieved = self.cache_object.last_retrieved end return @last_retrieved end # Sets the time that the feed was last updated. def last_retrieved=(new_last_retrieved) @last_retrieved = new_last_retrieved unless self.cache_object.nil? self.cache_object.last_retrieved = new_last_retrieved end end # True if this feed contains audio content enclosures def podcast? podcast = false $test_feed.items.each do |item| item.enclosures.each do |enclosure| podcast = true if enclosure.audio? end end return podcast end # True if this feed contains video content enclosures def vidlog? vidlog = false $test_feed.items.each do |item| item.enclosures.each do |enclosure| vidlog = true if enclosure.video? end end return vidlog end # True if this feed is malformed somehow def bozo? if @bozo.nil? @bozo = false end return @bozo end # True if the feed was not last retrieved from the cache. def live? return @live end # True if the feed has expired and must be reacquired from the remote server. def expired? return self.last_retrieved == nil || (self.last_retrieved + self.time_to_live.hour) < Time.now end # Forces this feed to expire. 
# Forces expiry by rewinding last_retrieved to the epoch and persisting.
def expire
  self.last_retrieved = Time.mktime(1970)
  self.save
end

# A hook method that is called during the feed generation process. Overriding this method
# will enable additional content to be inserted into the feed.
def build_xml_hook(feed_type, version, xml_builder)
  return nil
end

# Generates xml based on the content of the feed.
#
# feed_type:   "rss" or "atom"
# version:     0.0 selects a default (rss -> 1.0, atom -> 0.3)
# xml_builder: any Builder-compatible markup builder
# Returns the builder's output for the selected format.
def build_xml(feed_type="rss", version=0.0,
    xml_builder=Builder::XmlMarkup.new(:indent => 2))
  if feed_type == "rss" && version == 0.0
    version = 1.0
  elsif feed_type == "atom" && version == 0.0
    version = 0.3
  end
  if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
    # RDF-based rss format
    return xml_builder.tag!("rdf:RDF") do
      xml_builder.channel("rdf:about" => CGI.escapeHTML(link)) do
        unless title.nil? || title == ""
          xml_builder.title(title)
        else
          # Empty element keeps the document schema-complete.
          xml_builder.title
        end
        unless link.nil? || link == ""
          xml_builder.link(link)
        else
          xml_builder.link
        end
        unless image_link.nil? || image_link == ""
          xml_builder.image("rdf:resource" => CGI.escapeHTML(image_link))
        end
        unless description.nil? || description == ""
          xml_builder.description(description)
        else
          xml_builder.description
        end
        unless language.nil? || language == ""
          xml_builder.tag!("dc:language", language)
        end
        xml_builder.tag!("syn:updatePeriod", "hourly")
        # time_to_live is in seconds here; 1.hour is ActiveSupport.
        xml_builder.tag!("syn:updateFrequency", (time_to_live / 1.hour).to_s)
        xml_builder.tag!("syn:updateBase", Time.mktime(1970).iso8601)
        # RDF table of contents: one rdf:li per item.
        xml_builder.items do
          xml_builder.tag!("rdf:Seq") do
            unless items.nil?
              for item in items
                if item.link.nil?
                  raise "Cannot generate an rdf-based feed with a nil item link field."
                end
                xml_builder.tag!("rdf:li", "rdf:resource" =>
                  CGI.escapeHTML(item.link))
              end
            end
          end
        end
        build_xml_hook(feed_type, version, xml_builder)
      end
      unless image_link.nil? || image_link == ""
        xml_builder.image("rdf:about" => CGI.escapeHTML(image_link)) do
          unless title.nil? || title == ""
            xml_builder.title(title)
          else
            xml_builder.title
          end
          unless image_link.nil? || image_link == ""
            xml_builder.url(image_link)
          end
          unless link.nil? || link == ""
            xml_builder.link(link)
          else
            xml_builder.link
          end
        end
      end
      unless items.nil?
        for item in items
          item.build_xml(feed_type, version, xml_builder)
        end
      end
    end
  elsif feed_type == "rss"
    # normal rss format
    return xml_builder.rss("version" => version.to_s) do
      unless title.nil? || title == ""
        xml_builder.title(title)
      end
      unless link.nil? || link == ""
        xml_builder.link(link)
      end
      unless description.nil? || description == ""
        xml_builder.description(description)
      end
      # <ttl> is expressed in minutes; 1.minute is ActiveSupport.
      xml_builder.ttl((time_to_live / 1.minute).to_s)
      xml_builder.generator("http://www.sporkmonger.com/projects/feedtools")
      build_xml_hook(feed_type, version, xml_builder)
      unless items.nil?
        for item in items
          item.build_xml(feed_type, version, xml_builder)
        end
      end
    end
  elsif feed_type == "atom"
    # normal atom format
    return xml_builder.feed("xmlns" => "http://purl.org/atom/ns#",
        "version" => version.to_s,
        "xml:lang" => language) do
      unless title.nil? || title == ""
        xml_builder.title(title,
          "mode" => "escaped",
          "type" => "text/html")
      end
      unless link.nil? || link == ""
        xml_builder.link("href" => link,
          "rel" => "alternate",
          "type" => "text/html",
          "title" => title)
      end
      unless description.nil? || description == ""
        xml_builder.tagline(description,
          "mode" => "escaped",
          "type" => "text/html")
      end
      xml_builder.generator("FeedTools",
        "url" => "http://www.sporkmonger.com/projects/feedtools")
      build_xml_hook(feed_type, version, xml_builder)
      unless items.nil?
        for item in items
          item.build_xml(feed_type, version, xml_builder)
        end
      end
    end
  end
end

# Persists the current feed state to the cache.
# Guard clauses below; the success branch continues on the next chunk line.
def save
  if FeedTools.feed_cache.nil?
    raise "Caching is currently disabled. Cannot save to cache."
  elsif self.url.nil?
    raise "The url field must be set to save to the cache."
  elsif self.xml_data.nil?
    raise "The xml_data field must be set to save to the cache."
  elsif self.cache_object.nil?
    raise "The cache_object is currently nil. Cannot save to cache."
else self.cache_object.url = self.url self.cache_object.title = self.title self.cache_object.link = self.link self.cache_object.xml_data = self.xml_data unless self.http_response.nil? self.cache_object.http_headers = self.http_headers.to_yaml end self.cache_object.last_retrieved = self.last_retrieved self.cache_object.save end end alias_method :tagline, :description alias_method :tagline=, :description= alias_method :subtitle, :description alias_method :subtitle=, :description= alias_method :abstract, :description alias_method :abstract=, :description= alias_method :content, :description alias_method :content=, :description= alias_method :ttl, :time_to_live alias_method :ttl=, :time_to_live= alias_method :guid, :id alias_method :guid=, :id= alias_method :entries, :items # passes missing methods to the cache_object def method_missing(msg, *params) if self.cache_object.nil? raise NoMethodError, "Invalid method #{msg.to_s}" end return self.cache_object.send(msg, params) end # passes missing methods to the FeedTools.feed_cache def Feed.method_missing(msg, *params) if FeedTools.feed_cache.nil? raise NoMethodError, "Invalid method Feed.#{msg.to_s}" end result = FeedTools.feed_cache.send(msg, params) if result.kind_of? FeedTools.feed_cache result = Feed.open(result.url) end return result end end class FeedItem include REXML include AttributeDictionary # This class stores information about a feed item's file enclosures. 
class Enclosure
  include AttributeDictionary

  # The url for the enclosure
  attr_accessor :url
  # The MIME type of the file referenced by the enclosure
  attr_accessor :type
  # The size of the file referenced by the enclosure
  attr_accessor :file_size
  # The total play time of the file referenced by the enclosure
  attr_accessor :duration
  # The height in pixels of the enclosed media
  attr_accessor :height
  # The width in pixels of the enclosed media
  attr_accessor :width
  # The bitrate of the enclosed media
  attr_accessor :bitrate
  # The framerate of the enclosed media
  attr_accessor :framerate
  # The thumbnail for this enclosure
  attr_accessor :thumbnail
  # The categories for this enclosure
  attr_accessor :categories
  # A hash of the enclosed file
  attr_accessor :hash
  # A website containing some kind of media player instead of a direct
  # link to the media file.
  attr_accessor :player
  # A list of credits for the enclosed media
  attr_accessor :credits
  # A text rendition of the enclosed media
  attr_accessor :text
  # A list of alternate version of the enclosed media file
  attr_accessor :versions
  # The default version of the enclosed media file
  attr_accessor :default_version

  # Returns true if this is the default enclosure
  def is_default?
    return @is_default
  end

  # Sets whether this is the default enclosure for the media group
  def is_default=(new_is_default)
    @is_default = new_is_default
  end

  # Returns true if the enclosure contains explicit material
  def explicit?
    return @explicit
  end

  # Sets the explicit attribute on the enclosure
  def explicit=(new_explicit)
    @explicit = new_explicit
  end

  # Determines if the object is a sample, or the full version of the
  # object, or if it is a stream.
  # Possible values are 'sample', 'full', 'nonstop'.
  def expression
    return @expression
  end

  # Sets the expression attribute on the enclosure.
  # Allowed values are 'sample', 'full', 'nonstop'.
  # Raises ArgumentError for any other value; stores the downcased form.
  def expression=(new_expression)
    unless ['sample', 'full', 'nonstop'].include? new_expression.downcase
      raise ArgumentError, "Permitted values are 'sample', 'full', 'nonstop'."
    end
    @expression = new_expression.downcase
  end

  # Returns true if this enclosure contains audio content.
  # Checks the MIME type prefix first, then falls back to the url suffix.
  def audio?
    unless self.type.nil?
      return true if (self.type =~ /^audio/) != nil
    end
    # TODO: create a more complete list
    # =================================
    audio_extensions = ['mp3', 'm4a', 'm4p', 'wav', 'ogg', 'wma']
    audio_extensions.each do |extension|
      # NOTE(review): no dot before the extension, so any url merely
      # ending in these letters matches — confirm acceptable.
      if (url =~ /#{extension}$/) != nil
        return true
      end
    end
    return false
  end

  # Returns true if this enclosure contains video content.
  # Checks the MIME type (plus the nonstandard "image/mov"), then the
  # url suffix.
  def video?
    unless self.type.nil?
      return true if (self.type =~ /^video/) != nil
      return true if self.type == "image/mov"
    end
    # TODO: create a more complete list
    # =================================
    video_extensions = ['mov', 'mp4', 'avi', 'wmv', 'asf']
    video_extensions.each do |extension|
      if (url =~ /#{extension}$/) != nil
        return true
      end
    end
    return false
  end
end

# Lightweight value holders used by the enclosure parser.
EnclosureCategory = Struct.new(
  "EnclosureCategory",
  :category,
  :scheme,
  :label
)
EnclosureHash = Struct.new(
  "EnclosureHash",
  :hash,
  :type
)
EnclosurePlayer = Struct.new(
  "EnclosurePlayer",
  :url,
  :height,
  :width
)
EnclosureCredit = Struct.new(
  "EnclosureCredit",
  :name,
  :role
)
EnclosureThumbnail = Struct.new(
  "EnclosureThumbnail",
  :url,
  :height,
  :width
)

# Returns the parent feed of this feed item
def feed
  return @feed
end

# Sets the parent feed of this feed item
def feed=(new_feed)
  @feed = new_feed
end

# Returns the feed item's raw xml data.
def xml_data
  return @xml_data
end

# Sets the feed item's xml data.
def xml_data=(new_xml_data)
  @xml_data = new_xml_data
end

# Returns a REXML Document of the xml_data (parsed lazily, then cached).
def xml
  if @xml_doc.nil?
    @xml_doc = Document.new(xml_data)
  end
  return @xml_doc
end

# Returns the first node within the root_node that matches the xpath query.
def find_node(xpath)
  return XPath.first(root_node, xpath)
end

# Returns all nodes within the root_node that match the xpath query.
def find_all_nodes(xpath) return XPath.match(root_node, xpath) end # Returns the root node of the feed item. def root_node if @root_node.nil? @root_node = xml.root end return @root_node end # Returns the feed item title def title if @title.nil? if XPath.first(root_node, "title/@type").to_s == "xhtml" || XPath.first(root_node, "title/@mode").to_s == "xhtml" @title = XPath.first(root_node, "title").inner_xml elsif XPath.first(root_node, "title/@type").to_s == "escaped" || XPath.first(root_node, "title/@mode").to_s == "escaped" @title = CGI.unescapeHTML( XPath.first(root_node, "title/text()").to_s) else @title = CGI.unescapeHTML( XPath.first(root_node, "title/text()").to_s) end unless @title.nil? @title = CGI.unescapeHTML(FeedTools.sanitize_html(@title, :strip)) end if @title != "" # Some blogging tools include the number of comments in a post # in the title... this is supremely ugly, and breaks any # applications which expect the title to be static, so we're # gonna strip them out. # # If for some incredibly wierd reason you need the actual # unstripped title, just use find_node("title/text()").to_s @title = FeedTools.strip_html( @title.strip.gsub(/\[\d*\]$/, "")).strip @title.gsub!(/\n/, " ") end @title = nil if @title == "" end return @title end # Sets the feed item title def title=(new_title) @title = new_title end # Returns the feed item description def description if @description.nil? 
    # -- continuation of FeedItem#description (opened on the previous
    # line) --
    # get the item content
    # Fallback order: xhtml body, content:encoded, description (CDATA,
    # then text, honoring @encoding), atom content (escaped/xhtml modes),
    # raw description inner xml, itunes summary/subtitle, media text.
    @description = ""
    body_node = XPath.first(root_node, "xhtml:body")
    if body_node == nil
      body_node = XPath.first(root_node, "body")
    end
    if body_node != nil
      @description = body_node.inner_xml
    end
    if @description == ""
      @description = CGI.unescapeHTML(XPath.first(root_node,
        "content:encoded/text()").to_s)
    end
    if @description == ""
      begin
        @description = XPath.first(root_node,
          "description").cdatas.first.to_s
      rescue
        # No <description> node (or no CDATA): fall through to text().
        @description = ""
      end
      if @description == ""
        @description = XPath.first(root_node, "description/text()").to_s
      end
      if @description != ""
        if XPath.first(root_node, "description/@encoding").to_s != ""
          # Not supported... yet.
          @description = "[Embedded data objects are not supported.]"
        else
          @description = CGI.unescapeHTML(@description)
        end
      end
    end
    if @description == ""
      @description = XPath.first(root_node, "content/text()").to_s
      if @description != "" &&
          (XPath.first(root_node, "content/@mode").to_s == "escaped" ||
          XPath.first(root_node, "content/@type").to_s == "escaped")
        @description = CGI.unescapeHTML(@description)
      end
      if XPath.first(root_node, "content/@mode").to_s == "xhtml" ||
          XPath.first(root_node, "content/@type").to_s == "xhtml"
        @description = XPath.first(root_node, "content").inner_xml
      end
    end
    if @description == ""
      begin
        @description = XPath.first(root_node, "description").inner_xml
      rescue
        # Deliberate best-effort: a missing node is fine.
      end
    end
    if @description == ""
      @description = self.itunes_summary
      @description = "" if @description.nil?
    end
    if @description == ""
      @description = self.itunes_subtitle
      @description = "" if @description.nil?
    end
    if @description == ""
      @description = self.media_text
      @description = "" if @description.nil?
    end
    unless @description.nil?
      @description = FeedTools.sanitize_html(@description)
    end
    # If it started with a bunch of divs, hack them right off. We can put
    # them back later if they're needed.
    # NOTE(review): the first pattern looks truncated — per the comment it
    # should strip leading <div ...> tags (likely /^(<div[^>]*>)*/);
    # confirm against upstream.
    @description.gsub!(/^(]*>)*/, "")
    @description.gsub!(/(<\/div>)*$/, "")
    @description.gsub!(/\n/, " ") if @description.size < 80
    @description = @description.strip unless @description.nil?
    @description = nil if @description == ""
  end
  return @description
end

# Sets the feed item description
def description=(new_description)
  @description = new_description
end

# Returns the feed item link.
# Fallback order: alternate link href, any link href, link text,
# rdf:about, permalink guid, then the guid when it is a url. Relative
# links are resolved against feed.base before normalization.
def link
  if @link.nil?
    @link = XPath.first(root_node, "link[@rel='alternate']/@href").to_s
    if @link == ""
      @link = XPath.first(root_node, "link/@href").to_s
    end
    if @link == ""
      @link = XPath.first(root_node, "link/text()").to_s
    end
    if @link == ""
      @link = XPath.first(root_node, "@rdf:about").to_s
    end
    if @link == ""
      @link = XPath.first(root_node,
        "guid[@isPermaLink='true']/text()").to_s
    end
    if @link == ""
      if FeedTools.is_url? self.guid
        @link = self.guid
      end
    end
    if @link != ""
      @link = CGI.unescapeHTML(@link)
    end
    if @link != "" && (@link =~ /http:\/\//) != 0 &&
        (@link =~ /https:\/\//) != 0
      # Drop one of the two slashes when both base and link supply one.
      if (feed.base[-1..-1] == "/" && @link[0..0] == "/")
        @link = @link[1..-1]
      end
      # prepend the base to the link since they seem to have used a relative path
      @link = feed.base + @link
    end
    @link = FeedTools.normalize_url(@link)
  end
  return @link
end

# Sets the feed item link
def link=(new_link)
  @link = new_link
end

# Returns the feed comment link, falling back to the item link.
def comment_link
  if @comment_link.nil?
    # get the feed comment link from the xml document
    @comment_link = XPath.first(root_node, "comments/text()").to_s
    if @comment_link == ""
      @comment_link = self.link
    end
    @comment_link = FeedTools.normalize_url(@comment_link)
  end
  return @comment_link
end

# Sets the feed comment link
def comment_link=(new_comment_link)
  @comment_link = new_comment_link
end

# Returns the feed image link
# (body continues on the next chunk line)
def image_link
  if @image_link.nil?
# get the feed image link from the xml document if @image_link == "" @image_link = XPath.first(root_node, "link[@type='image/jpeg']/@href").to_s end if @image_link == "" @image_link = XPath.first(root_node, "link[@type='image/gif']/@href").to_s end if @image_link == "" @image_link = XPath.first(root_node, "link[@type='image/png']/@href").to_s end # The following two should technically never occur, but have been included # simply because I've seen both occuring in the wild at least once. if @image_link == "" @image_link = XPath.first(root_node, "image/url/text()").to_s end if @image_link == "" @image_link = XPath.first(root_node, "image/@rdf:resource").to_s end if @image_link == "" # If there's only a media thumbnail, we can just borrow it. Technically, this isn't # ideal, but chances are very good that anything that makes use of this image is # simply not going to care anyhow. @image_link = XPath.first(root_node, "media:thumbnail/@url").to_s if @image_link == "" @media_image_link = @image_link end end if @image_link == "" # If there's only an itunes image, we can just borrow it. See comment above regarding # less-than-ideal-ness. if @itunes_image_link == "" @image_link = XPath.first(root_node, "itunes:image/@href").to_s if @image_link == "" @image_link = XPath.first(root_node, "itunes:link[@rel='image']/@href").to_s end @itunes_image_link = @image_link else @image_link = @itunes_image_link end end @image_link = FeedTools.normalize_url(@image_link) end return @image_link end # Sets the feed image link def image_link=(new_image_link) @image_link = new_image_link end # Returns the feed item itunes image link # # If it's not present, falls back to the normal image link. # Technically, the itunes spec says that the image needs to be # square and larger than 300x300, but hey, if there's an image # to be had, it's better than none at all. def itunes_image_link if @itunes_image_link.nil? 
# get the feed item itunes image link from the xml document @itunes_image_link = XPath.first(root_node, "itunes:image/@href").to_s if @itunes_image_link == "" @itunes_image_link = XPath.first(root_node, "itunes:link[@rel='image']/@href").to_s end if @itunes_image_link == "" @itunes_image_link = self.image_link end @itunes_image_link = FeedTools.normalize_url(@itunes_image_link) end return @itunes_image_link end # Sets the feed item itunes image link def itunes_image_link=(new_itunes_image_link) @itunes_image_link = new_itunes_image_link end # Returns the feed item media thumbnail link # # If it's not present, falls back to the normal image link. def media_thumbnail_link if @media_thumbnail_link.nil? # get the feed item itunes image link from the xml document @media_thumbnail_link = XPath.first(root_node, "media:thumbnail/@url").to_s if @media_thumbnail_link == "" @media_thumbnail_link = image_link end @media_thumbnail_link = FeedTools.normalize_url(@media_thumbnail_link) end return @media_thumbnail_link end # Sets the feed item media thumbnail url def media_thumbnail_link=(new_media_thumbnail_link) @media_thumbnail_link = new_media_thumbnail_link end # Returns the feed items's unique id def id if @id.nil? @id = XPath.first(root_node, "id/text()").to_s if @id == "" @id = XPath.first(root_node, "guid/text()").to_s end @id = nil if @id == "" end return @id end # Sets the feed item's unique id def id=(new_id) @id = new_id end # Returns all feed item enclosures def enclosures if @enclosures.nil? @enclosures = [] # First, load up all the different possible sources of enclosures rss_enclosures = XPath.match(root_node, "enclosure") atom_enclosures = XPath.match(root_node, "link[@rel='enclosure']") media_content_enclosures = XPath.match(root_node, "media:content") media_group_enclosures = XPath.match(root_node, "media:group") # Parse RSS-type enclosures. Thanks to a few buggy enclosures implementations, # sometimes these also manage to show up in atom files. 
for enclosure_node in rss_enclosures enclosure = Enclosure.new enclosure.url = CGI.unescapeHTML(enclosure_node.attributes["url"].to_s) enclosure.type = enclosure_node.attributes["type"].to_s enclosure.file_size = enclosure_node.attributes["length"].to_i enclosure.credits = [] enclosure.explicit = false @enclosures << enclosure end # Parse atom-type enclosures. If there are repeats of the same enclosure object, # we merge the two together. for enclosure_node in atom_enclosures enclosure_url = CGI.unescapeHTML(enclosure_node.attributes["href"].to_s) enclosure = nil new_enclosure = false for existing_enclosure in @enclosures if existing_enclosure.url == enclosure_url enclosure = existing_enclosure break end end if enclosure.nil? new_enclosure = true enclosure = Enclosure.new end enclosure.url = enclosure_url enclosure.type = enclosure_node.attributes["type"].to_s enclosure.file_size = enclosure_node.attributes["length"].to_i enclosure.credits = [] enclosure.explicit = false if new_enclosure @enclosures << enclosure end end # Creates an anonymous method to parse content objects from the media module. We # do this to avoid excessive duplication of code since we have to do identical # processing for content objects within group objects. parse_media_content = lambda do |media_content_nodes| affected_enclosures = [] for enclosure_node in media_content_nodes enclosure_url = CGI.unescapeHTML(enclosure_node.attributes["url"].to_s) enclosure = nil new_enclosure = false for existing_enclosure in @enclosures if existing_enclosure.url == enclosure_url enclosure = existing_enclosure break end end if enclosure.nil? 
new_enclosure = true enclosure = Enclosure.new end enclosure.url = enclosure_url enclosure.type = enclosure_node.attributes["type"].to_s enclosure.file_size = enclosure_node.attributes["fileSize"].to_i enclosure.duration = enclosure_node.attributes["duration"].to_s enclosure.height = enclosure_node.attributes["height"].to_i enclosure.width = enclosure_node.attributes["width"].to_i enclosure.bitrate = enclosure_node.attributes["bitrate"].to_i enclosure.framerate = enclosure_node.attributes["framerate"].to_i enclosure.expression = enclosure_node.attributes["expression"].to_s enclosure.is_default = (enclosure_node.attributes["isDefault"].to_s.downcase == "true") if XPath.first(enclosure_node, "media:thumbnail/@url").to_s != "" enclosure.thumbnail = EnclosureThumbnail.new( CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@url").to_s), CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@height").to_s), CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@width").to_s) ) if enclosure.thumbnail.height == "" enclosure.thumbnail.height = nil end if enclosure.thumbnail.width == "" enclosure.thumbnail.width = nil end end enclosure.categories = [] for category in XPath.match(enclosure_node, "media:category") enclosure.categories << EnclosureCategory.new( CGI.unescapeHTML(category.text), CGI.unescapeHTML(category.attributes["scheme"].to_s), CGI.unescapeHTML(category.attributes["label"].to_s) ) if enclosure.categories.last.scheme == "" enclosure.categories.last.scheme = nil end if enclosure.categories.last.label == "" enclosure.categories.last.label = nil end end if XPath.first(enclosure_node, "media:hash/text()").to_s != "" enclosure.hash = EnclosureHash.new( FeedTools.sanitize_html(CGI.unescapeHTML(XPath.first( enclosure_node, "media:hash/text()").to_s), :strip), "md5" ) end if XPath.first(enclosure_node, "media:player/@url").to_s != "" enclosure.player = EnclosurePlayer.new( CGI.unescapeHTML(XPath.first(enclosure_node, 
"media:player/@url").to_s), CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@height").to_s), CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@width").to_s) ) if enclosure.player.height == "" enclosure.player.height = nil end if enclosure.player.width == "" enclosure.player.width = nil end end enclosure.credits = [] for credit in XPath.match(enclosure_node, "media:credit") enclosure.credits << EnclosureCredit.new( CGI.unescapeHTML(CGI.unescapeHTML(credit.text)), CGI.unescapeHTML(credit.attributes["role"].to_s.downcase) ) if enclosure.credits.last.role == "" enclosure.credits.last.role = nil end end enclosure.explicit = (XPath.first(enclosure_node, "media:adult/text()").to_s.downcase == "true") if XPath.first(enclosure_node, "media:text/text()").to_s != "" enclosure.text = CGI.unescapeHTML(XPath.first(enclosure_node, "media:text/text()").to_s) end affected_enclosures << enclosure if new_enclosure @enclosures << enclosure end end affected_enclosures end # Parse the independant content objects. parse_media_content.call(media_content_enclosures) media_groups = [] # Parse the group objects. for media_group in media_group_enclosures group_media_content_enclosures = XPath.match(media_group, "media:content") # Parse the content objects within the group objects. affected_enclosures = parse_media_content.call(group_media_content_enclosures) # Now make sure that content objects inherit certain properties from # the group objects. for enclosure in affected_enclosures if enclosure.thumbnail.nil? 
&& XPath.first(media_group, "media:thumbnail/@url").to_s != "" enclosure.thumbnail = EnclosureThumbnail.new( CGI.unescapeHTML( XPath.first(media_group, "media:thumbnail/@url").to_s), CGI.unescapeHTML( XPath.first(media_group, "media:thumbnail/@height").to_s), CGI.unescapeHTML( XPath.first(media_group, "media:thumbnail/@width").to_s) ) if enclosure.thumbnail.height == "" enclosure.thumbnail.height = nil end if enclosure.thumbnail.width == "" enclosure.thumbnail.width = nil end end if (enclosure.categories.nil? || enclosure.categories.size == 0) enclosure.categories = [] for category in XPath.match(media_group, "media:category") enclosure.categories << EnclosureCategory.new( CGI.unescapeHTML(category.text), CGI.unescapeHTML(category.attributes["scheme"].to_s), CGI.unescapeHTML(category.attributes["label"].to_s) ) if enclosure.categories.last.scheme == "" enclosure.categories.last.scheme = nil end if enclosure.categories.last.label == "" enclosure.categories.last.label = nil end end end if enclosure.hash.nil? && XPath.first(media_group, "media:hash/text()").to_s != "" enclosure.hash = EnclosureHash.new( CGI.unescapeHTML(XPath.first(media_group, "media:hash/text()").to_s), "md5" ) end if enclosure.player.nil? && XPath.first(media_group, "media:player/@url").to_s != "" enclosure.player = EnclosurePlayer.new( CGI.unescapeHTML(XPath.first(media_group, "media:player/@url").to_s), CGI.unescapeHTML(XPath.first(media_group, "media:player/@height").to_s), CGI.unescapeHTML(XPath.first(media_group, "media:player/@width").to_s) ) if enclosure.player.height == "" enclosure.player.height = nil end if enclosure.player.width == "" enclosure.player.width = nil end end if enclosure.credits.nil? 
|| enclosure.credits.size == 0 enclosure.credits = [] for credit in XPath.match(media_group, "media:credit") enclosure.credits << EnclosureCredit.new( CGI.unescapeHTML(CGI.unescapeHTML(credit.text)), CGI.unescapeHTML(credit.attributes["role"].to_s.downcase) ) if enclosure.credits.last.role == "" enclosure.credits.last.role = nil end end end if enclosure.explicit?.nil? enclosure.explicit = (XPath.first(media_group, "media:adult/text()").to_s.downcase == "true") ? true : false end if enclosure.text.nil? && XPath.first(media_group, "media:text/text()").to_s != "" enclosure.text = FeedTools.sanitize_html(CGI.unescapeHTML( XPath.first(media_group, "media:text/text()").to_s), :strip) end end # Keep track of the media groups media_groups << affected_enclosures end # Now we need to inherit any relevant item level information. if self.explicit? for enclosure in @enclosures enclosure.explicit = true end end # Add all the itunes categories for itunes_category in XPath.match(root_node, "itunes:category") genre = "Podcasts" category = itunes_category.attributes["text"].to_s subcategory = XPath.first(itunes_category, "itunes:category/@text").to_s category_path = genre if category != "" category_path << "/" + category end if subcategory != "" category_path << "/" + subcategory end for enclosure in @enclosures if enclosure.categories.nil? 
enclosure.categories = [] end enclosure.categories << EnclosureCategory.new( CGI.unescapeHTML(category_path), CGI.unescapeHTML("http://www.apple.com/itunes/store/"), CGI.unescapeHTML("iTunes Music Store Categories") ) end end for enclosure in @enclosures # Clean up any of those attributes that incorrectly have "" # or 0 as their values if enclosure.type == "" enclosure.type = nil end if enclosure.file_size == 0 enclosure.file_size = nil end if enclosure.duration == 0 enclosure.duration = nil end if enclosure.height == 0 enclosure.height = nil end if enclosure.width == 0 enclosure.width = nil end if enclosure.bitrate == 0 enclosure.bitrate = nil end if enclosure.framerate == 0 enclosure.framerate = nil end if enclosure.expression == "" || enclosure.expression.nil? enclosure.expression = "full" end # If an enclosure is missing the text field, fall back on the itunes:summary field if enclosure.text.nil? || enclosure.text = "" enclosure.text = self.itunes_summary end # Make sure we don't have duplicate categories unless enclosure.categories.nil? enclosure.categories.uniq! end end # And finally, now things get complicated. This is where we make # sure that the enclosures method only returns either default # enclosures or enclosures with only one version. Any enclosures # that are wrapped in a media:group will be placed in the appropriate # versions field. affected_enclosure_urls = [] for media_group in media_groups affected_enclosure_urls = affected_enclosure_urls | (media_group.map do |enclosure| enclosure.url end) end @enclosures.delete_if do |enclosure| (affected_enclosure_urls.include? enclosure.url) end for media_group in media_groups default_enclosure = nil for enclosure in media_group if enclosure.is_default? 
default_enclosure = enclosure end end for enclosure in media_group enclosure.default_version = default_enclosure enclosure.versions = media_group.clone enclosure.versions.delete(enclosure) end @enclosures << default_enclosure end end # If we have a single enclosure, it's safe to inherit the itunes:duration field # if it's missing. if @enclosures.size == 1 if @enclosures.first.duration.nil? || @enclosures.first.duration == 0 @enclosures.first.duration = self.duration end end return @enclosures end def enclosures=(new_enclosures) @enclosures = new_enclosures end # Returns the feed item author def author_name # TODO: make this not suck, actually ensure we're looking at a name # and not an email address. # Also, factor in itunes module. # ================================================================= if @author_name.nil? @author_name = CGI.unescapeHTML(XPath.first(root_node, "author/name/text()").to_s) if @author_name == "" @author_name = CGI.unescapeHTML(XPath.first(root_node, "dc:creator/text()").to_s) end if @author_name == "" @author_name = CGI.unescapeHTML(XPath.first(root_node, "author/text()").to_s) end end return @author_name end # Sets the feed item author def author_name=(new_author_name) @author_name = new_author_name end # Returns the contents of the itunes:summary element def itunes_summary if @itunes_summary.nil? @itunes_summary = CGI.unescapeHTML(XPath.first(root_node, "itunes:summary/text()").to_s) if @itunes_summary == "" @itunes_summary = nil end unless @itunes_summary.nil? @itunes_summary = FeedTools.sanitize_html(@itunes_summary) end end return @itunes_summary end # Sets the contents of the itunes:summary element def itunes_summary=(new_itunes_summary) @itunes_summary = new_itunes_summary end # Returns the contents of the itunes:subtitle element def itunes_subtitle if @itunes_subtitle.nil? 
@itunes_subtitle = CGI.unescapeHTML(XPath.first(root_node, "itunes:subtitle/text()").to_s) if @itunes_subtitle == "" @itunes_subtitle = nil end unless @itunes_subtitle.nil? @itunes_subtitle = FeedTools.sanitize_html(@itunes_subtitle) end end return @itunes_subtitle end # Sets the contents of the itunes:subtitle element def itunes_subtitle=(new_itunes_subtitle) @itunes_subtitle = new_itunes_subtitle end # Returns the contents of the media:text element def media_text if @media_text.nil? @media_text = CGI.unescapeHTML(XPath.first(root_node, "itunes:subtitle/text()").to_s) if @media_text == "" @media_text = nil end unless @media_text.nil? @media_text = FeedTools.sanitize_html(@media_text) end end return @media_text end # Sets the contents of the media:text element def media_text=(new_media_text) @media_text = new_media_text end # Returns the contents of the itunes:author element # # This inherits from any incorrectly placed channel-level itunes:author # elements. They're actually amazingly commong. People don't read specs. def itunes_author if @itunes_author.nil? @itunes_author = CGI.unescapeHTML(XPath.first(root_node, "itunes:author/text()").to_s) if @itunes_author == "" @itunes_author = CGI.unescapeHTML(XPath.first(feed.channel_node, "itunes:author/text()").to_s) end if @itunes_author == "" @itunes_author = nil end end return @itunes_author end # Sets the contents of the itunes:author element def itunes_author=(new_itunes_author) @itunes_author = new_itunes_author end # Returns the number of seconds that the associated media runs for def duration if @duration.nil? 
itunes_duration = CGI.unescapeHTML(XPath.first(root_node, "itunes:duration/text()").to_s) if itunes_duration != "" hms = itunes_duration.split(":").map { |x| x.to_i } if hms.size == 3 @duration = hms[0].hour + hms[1].minute + hms[2] elsif hms.size == 2 @duration = hms[0].minute + hms[1] elsif hms.size == 1 @duration = hms[0] end end end return @duration end # Sets the number of seconds that the associate media runs for def duration=(new_duration) @duration = new_duration end # Sets the itunes:summary def itunes_summary=(new_itunes_summary) end # Returns the feed item time def time if @time.nil? time_string = XPath.first(root_node, "pubDate/text()").to_s if time_string == "" time_string = XPath.first(root_node, "dc:date/text()").to_s end if time_string == "" time_string = XPath.first(root_node, "issued/text()").to_s end if time_string != "" @time = Time.parse(time_string) rescue Time.now elsif time_string == nil @time = Time.now end end return @time end # Sets the feed item time def time=(new_time) @time = new_time end # Returns the feed item tags def tags # TODO: support the rel="tag" microformat # ======================================= if @tags.nil? @tags = [] if @tags.nil? || @tags.size == 0 @tags = [] tag_list = XPath.match(root_node, "dc:subject/rdf:Bag/rdf:li/text()") if tag_list.size > 1 for tag in tag_list @tags << tag.to_s.downcase.strip end end end if @tags.nil? || @tags.size == 0 # messy effort to find ourselves some tags, mainly for del.icio.us @tags = [] rdf_bag = XPath.match(root_node, "taxo:topics/rdf:Bag/rdf:li") if rdf_bag != nil && rdf_bag.size > 0 for tag_node in rdf_bag begin tag_url = XPath.first(root_node, "@resource").to_s tag_match = tag_url.scan(/\/(tag|tags)\/(\w+)/) if tag_match.size > 0 @tags << tag_match.first.last.downcase.strip end rescue end end end end if @tags.nil? || @tags.size == 0 @tags = [] tag_list = XPath.match(root_node, "category/text()") for tag in tag_list @tags << tag.to_s.downcase.strip end end if @tags.nil? 
|| @tags.size == 0 @tags = [] tag_list = XPath.match(root_node, "dc:subject/text()") for tag in tag_list @tags << tag.to_s.downcase.strip end end if @tags.nil? || @tags.size == 0 begin @tags = XPath.first(root_node, "itunes:keywords/text()").to_s.downcase.split(" ") rescue @tags = [] end end if @tags.nil? @tags = [] end @tags.uniq! end return @tags end # Sets the feed item tags def tags=(new_tags) @tags = new_tags end # Returns true if this feed item contains explicit material. If the whole # feed has been marked as explicit, this will return true even if the item # isn't explicitly marked as explicit. def explicit? if @explicit.nil? if XPath.first(root_node, "media:adult/text()").to_s.downcase == "true" || XPath.first(root_node, "itunes:explicit/text()").to_s.downcase == "yes" || XPath.first(root_node, "itunes:explicit/text()").to_s.downcase == "true" || feed.explicit @explicit = true else @explicit = false end end return @explicit end # Sets whether or not the feed contains explicit material def explicit=(new_explicit) @explicit = (new_explicit ? true : false) end # A hook method that is called during the feed generation process. Overriding this method # will enable additional content to be inserted into the feed. def build_xml_hook(feed_type, version, xml_builder) return nil end # Generates xml based on the content of the feed item def build_xml(feed_type="rss", version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2)) if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1) # RDF-based rss format if link.nil? raise "Cannot generate an rdf-based feed item with a nil link field." end return xml_builder.item("rdf:about" => CGI.escapeHTML(link)) do unless title.nil? || title == "" xml_builder.title(title) else xml_builder.title end unless link.nil? || link == "" xml_builder.link(link) else xml_builder.link end unless description.nil? || description == "" xml_builder.description(description) else xml_builder.description end unless time.nil? 
xml_builder.tag!("dc:date", time.iso8601) end unless tags.nil? || tags.size == 0 xml_builder.tag!("dc:subject") do xml_builder.tag!("rdf:Bag") do for tag in tags xml_builder.tag!("rdf:li", tag) end end end xml_builder.tag!("itunes:keywords", tags.join(" ")) end build_xml_hook(feed_type, version, xml_builder) end elsif feed_type == "rss" # normal rss format return xml_builder.item do unless title.nil? || title == "" xml_builder.title(title) end unless link.nil? || link == "" xml_builder.link(link) end unless description.nil? || description == "" xml_builder.description(description) end unless time.nil? xml_builder.pubDate(time.rfc822) end unless tags.nil? || tags.size == 0 xml_builder.tag!("dc:subject") do xml_builder.tag!("rdf:Bag") do for tag in tags xml_builder.tag!("rdf:li", tag) end end end xml_builder.tag!("itunes:keywords", tags.join(" ")) end build_xml_hook(feed_type, version, xml_builder) end elsif feed_type == "atom" # normal atom format return xml_builder.entry("xmlns" => "http://purl.org/atom/ns#") do unless title.nil? || title == "" xml_builder.title(title, "mode" => "escaped", "type" => "text/html") end unless link.nil? || link == "" xml_builder.link("href" => link, "rel" => "alternate", "type" => "text/html", "title" => title) end unless description.nil? || description == "" xml_builder.content(description, "mode" => "escaped", "type" => "text/html") end unless time.nil? xml_builder.issued(time.iso8601) end unless tags.nil? 
|| tags.size == 0 for tag in tags xml_builder.category(tag) end end build_xml_hook(feed_type, version, xml_builder) end end end alias_method :tagline, :description alias_method :tagline=, :description= alias_method :subtitle, :description alias_method :subtitle=, :description= alias_method :abstract, :description alias_method :abstract=, :description= alias_method :content, :description alias_method :content=, :description= alias_method :guid, :id alias_method :guid=, :id= end end module REXML #:nodoc: class Element #:nodoc: def inner_xml #:nodoc: result = "" self.each_child do |child| result << child.to_s end return result end end end begin unless FeedTools.feed_cache.nil? FeedTools.feed_cache.initialize_cache end rescue end