feed_item.rb in feedtools-0.2.18

- old
+ new

@@ -1,10 +1,37 @@
+#--
+# Copyright (c) 2005 Robert Aman
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#++
+
 module FeedTools
   # The <tt>FeedTools::FeedItem</tt> class represents the structure of
   # a single item within a web feed.
   class FeedItem
+    # :stopdoc:
     include REXML
+    include GenericHelper
+    private :validate_options
+    # :startdoc:
     
     # This class stores information about a feed item's file enclosures.
     class Enclosure
       # The url for the enclosure
       attr_accessor :url
@@ -123,66 +150,54 @@
       :width )
     
     # Initializes a new feed item with no feed data loaded.
     #
     # The raw data, XML document, root node, title and id all start out as
     # nil, the data type defaults to :xml, and the item time defaults to
     # the current time converted to UTC.
     def initialize
       super
-      @feed = nil
       @feed_data = nil
       @feed_data_type = :xml
       @xml_doc = nil
       @root_node = nil
       @title = nil
       @id = nil
       @time = Time.now.gmtime
     end
 
     # Returns the parent feed of this feed item
+    # Warning, this method may be slow if you have a
+    # large number of FeedTools::Feed objects.  Can't
+    # use a direct reference to the parent because it plays
+    # havoc with the garbage collector.
     #
     # Returns nil when no live FeedTools::Feed holds this item in its
     # @items list. Raises a RuntimeError ("Multiple parent feeds found.")
     # when more than one feed holds it.
     def feed
-      return @feed
+      parent_feed = nil
       # Walk every live FeedTools::Feed instance known to the interpreter.
+      ObjectSpace.each_object(FeedTools::Feed) do |feed|
         # @items is read directly; when it is still nil, feed.items is called
         # first (presumably that call populates @items -- confirm in Feed).
+        if feed.instance_variable_get("@items").nil?
+          feed.items
+        end
+        unsorted_items = feed.instance_variable_get("@items")
+        for item in unsorted_items
           # Match on object identity rather than on ==.
+          if item.object_id == self.object_id
+            if parent_feed.nil?
+              parent_feed = feed
               # This break only leaves the inner for loop; the remaining feeds
               # are still scanned so that a second parent can be detected.
+              break
+            else
+              raise "Multiple parent feeds found."
+            end
+          end
+        end
+      end
+      return parent_feed
     end
     
-    # Sets the parent feed of this feed item
-    def feed=(new_feed)
-      @feed = new_feed
-    end
-
     # Returns the feed item's raw data.
     #
     # This is whatever was last assigned through feed_data=; initialize
     # leaves it nil.
     def feed_data
       return @feed_data
     end
 
     # Sets the feed item's data.
     #
     # Assigning new data also resets @time to nil. No other instance
     # variable is touched in this method.
     def feed_data=(new_feed_data)
       # Drop the stored timestamp; the time method only parses one when
       # @time is nil.
       @time = nil
       @feed_data = new_feed_data
-      
-      # We need an immediate parse of the time so we don't mess up sort orders
-      unless root_node.nil?
-        repair_entities = false
-        time_node = XPath.first(root_node, "pubDate")
-        if time_node.nil?
-          time_node = XPath.first(root_node, "dc:date")
-        end
-        if time_node.nil?
-          time_node = XPath.first(root_node, "dc:date", FEED_TOOLS_NAMESPACES)
-        end
-        if time_node.nil?
-          time_node = XPath.first(root_node, "issued")
-        end
-        if time_node.nil?
-          time_node = XPath.first(root_node, "updated")
-        end
-        if time_node.nil?
-          time_node = XPath.first(root_node, "time")
-        end
-      end
-      unless time_node.nil?
-        begin
-          @time = Time.parse(time_node.inner_xml)
-        rescue
-        end
-      end
     end
 
     # Returns the feed item's data type.
     def feed_data_type
       return @feed_data_type
@@ -251,34 +266,51 @@
     # Returns the feed item title
     def title
       if @title.nil?
         unless root_node.nil?
           repair_entities = false
-          title_node = XPath.first(root_node, "title")
+          title_node = XPath.first(root_node, "atom10:title",
+            FEED_TOOLS_NAMESPACES)
           if title_node.nil?
+            title_node = XPath.first(root_node, "title")
+          end
+          if title_node.nil?
+            title_node = XPath.first(root_node, "atom03:title",
+              FEED_TOOLS_NAMESPACES)
+          end
+          if title_node.nil?
             title_node = XPath.first(root_node, "atom:title")
           end
           if title_node.nil?
+            title_node = XPath.first(root_node, "dc:title",
+              FEED_TOOLS_NAMESPACES)
+          end
+          if title_node.nil?
             title_node = XPath.first(root_node, "dc:title")
           end
           if title_node.nil?
             title_node = XPath.first(root_node, "TITLE")
           end
         end
         if title_node.nil?
           return nil
         end
-        if XPath.first(title_node, "@type").to_s == "xhtml" || 
-            XPath.first(title_node, "@mode").to_s == "xhtml" ||
-            XPath.first(title_node, "@type").to_s == "xml" || 
-            XPath.first(title_node, "@mode").to_s == "xml" ||
-            XPath.first(title_node, "@type").to_s == "application/xhtml+xml"
+        title_type = XPath.first(title_node, "@type").to_s
+        title_mode = XPath.first(title_node, "@mode").to_s
+        title_encoding = XPath.first(title_node, "@encoding").to_s
+        
+        # Note that we're checking for misuse of type, mode and encoding here
+        if title_type == "base64" || title_mode == "base64" ||
+            title_encoding == "base64"
+          @title = Base64.decode64(title_node.inner_xml.strip)
+        elsif title_type == "xhtml" || title_mode == "xhtml" ||
+            title_type == "xml" || title_mode == "xml" ||
+            title_type == "application/xhtml+xml"
           @title = title_node.inner_xml
-        elsif XPath.first(title_node, "@type").to_s == "escaped" ||
-            XPath.first(title_node, "@mode").to_s == "escaped"
+        elsif title_type == "escaped" || title_mode == "escaped"
           @title = FeedTools.unescape_entities(
-            XPath.first(title_node, "text()").to_s)
+            title_node.inner_xml)
         else
           @title = title_node.inner_xml
           repair_entities = true
         end
         unless @title.nil?
@@ -364,31 +396,33 @@
           end
         end
         if description_node.nil?
           return nil
         end
-        unless description_node.nil?
-          if XPath.first(description_node, "@encoding").to_s != ""
-            @description =
-              "[Embedded data objects are not currently supported.]"
-          elsif description_node.cdatas.size > 0
-            @description = description_node.cdatas.first.value
-          elsif XPath.first(description_node, "@type").to_s == "xhtml" || 
-              XPath.first(description_node, "@mode").to_s == "xhtml" ||
-              XPath.first(description_node, "@type").to_s == "xml" || 
-              XPath.first(description_node, "@mode").to_s == "xml" ||
-              XPath.first(description_node, "@type").to_s ==
-                "application/xhtml+xml"
-            @description = description_node.inner_xml
-          elsif XPath.first(description_node, "@type").to_s == "escaped" ||
-              XPath.first(description_node, "@mode").to_s == "escaped"
-            @description = FeedTools.unescape_entities(
-              description_node.inner_xml)
-          else
-            @description = description_node.inner_xml
-            repair_entities = true
-          end
+        description_type = XPath.first(description_node, "@type").to_s
+        description_mode = XPath.first(description_node, "@mode").to_s
+        description_encoding = XPath.first(description_node, "@encoding").to_s
+
+        # Note that we're checking for misuse of type, mode and encoding here
+        if description_encoding != ""
+          @description =
+            "[Embedded data objects are not currently supported.]"
+        elsif description_node.cdatas.size > 0
+          @description = description_node.cdatas.first.value
+        elsif description_type == "base64" || description_mode == "base64" ||
+            description_encoding == "base64"
+          @description = Base64.decode64(description_node.inner_xml.strip)
+        elsif description_type == "xhtml" || description_mode == "xhtml" ||
+            description_type == "xml" || description_mode == "xml" ||
+            description_type == "application/xhtml+xml"
+          @description = description_node.inner_xml
+        elsif description_type == "escaped" || description_mode == "escaped"
+          @description = FeedTools.unescape_entities(
+            description_node.inner_xml)
+        else
+          @description = description_node.inner_xml
+          repair_entities = true
         end
         if @description == ""
           @description = self.itunes_summary
           @description = "" if @description.nil?
         end
@@ -664,23 +698,74 @@
 
     # Returns the feed item's copyright information
     #
     # A non-nil result is kept in @copyright and returned directly on later
     # calls. The first node found among dc:rights, rights, copyright,
     # atom03:copyright, atom10:copyright and copyrights (tried in that
     # order) supplies the text. Returns nil when no such node exists or
     # when the cleaned-up text is empty.
     def copyright
       if @copyright.nil?
         unless root_node.nil?
-          @copyright = XPath.first(root_node, "dc:rights/text()").to_s
-          if @copyright == ""
-            @copyright = XPath.first(root_node, "rights/text()").to_s
+          repair_entities = false
+
           # dc:rights is tried first without, then with, the
           # FEED_TOOLS_NAMESPACES mapping; every later lookup passes the
           # mapping.
+          copyright_node = XPath.first(root_node, "dc:rights")
+          if copyright_node.nil?
+            copyright_node = XPath.first(root_node, "dc:rights",
+              FEED_TOOLS_NAMESPACES)
           end
-          if @copyright == ""
-            @copyright = XPath.first(root_node, "copyright/text()").to_s
+          if copyright_node.nil?
+            copyright_node = XPath.first(root_node, "rights",
+              FEED_TOOLS_NAMESPACES)
           end
-          if @copyright == ""
-            @copyright = XPath.first(root_node, "copyrights/text()").to_s
+          if copyright_node.nil?
+            copyright_node = XPath.first(root_node, "copyright",
+              FEED_TOOLS_NAMESPACES)
           end
+          if copyright_node.nil?
+            copyright_node = XPath.first(root_node, "atom03:copyright",
+              FEED_TOOLS_NAMESPACES)
+          end
+          if copyright_node.nil?
+            copyright_node = XPath.first(root_node, "atom10:copyright",
+              FEED_TOOLS_NAMESPACES)
+          end
+          if copyright_node.nil?
+            copyright_node = XPath.first(root_node, "copyrights",
+              FEED_TOOLS_NAMESPACES)
+          end
+        end
         # copyright_node is also nil here when root_node was nil, because
         # none of the lookups above ran.
+        if copyright_node.nil?
+          return nil
+        end
+        copyright_type = XPath.first(copyright_node, "@type").to_s
+        copyright_mode = XPath.first(copyright_node, "@mode").to_s
+        copyright_encoding = XPath.first(copyright_node, "@encoding").to_s
+
+        # Note that we're checking for misuse of type, mode and encoding here
         # NOTE(review): any non-empty encoding attribute is caught by the
         # first branch below, so the copyright_encoding == "base64" test in
         # the base64 branch can never be the condition that selects it.
+        if copyright_encoding != ""
+          @copyright =
+            "[Embedded data objects are not currently supported.]"
+        elsif copyright_node.cdatas.size > 0
+          @copyright = copyright_node.cdatas.first.value
+        elsif copyright_type == "base64" || copyright_mode == "base64" ||
+            copyright_encoding == "base64"
+          @copyright = Base64.decode64(copyright_node.inner_xml.strip)
+        elsif copyright_type == "xhtml" || copyright_mode == "xhtml" ||
+            copyright_type == "xml" || copyright_mode == "xml" ||
+            copyright_type == "application/xhtml+xml"
+          @copyright = copyright_node.inner_xml
+        elsif copyright_type == "escaped" || copyright_mode == "escaped"
+          @copyright = FeedTools.unescape_entities(
+            copyright_node.inner_xml)
+        else
           # No recognised type or mode: keep the inner XML as is and flag it
           # so entities are unescaped after sanitising.
+          @copyright = copyright_node.inner_xml
+          repair_entities = true
+        end
+
+        unless @copyright.nil?
           @copyright = FeedTools.sanitize_html(@copyright, :strip)
-          @copyright = nil if @copyright == ""
+          @copyright = FeedTools.unescape_entities(@copyright) if repair_entities
+          @copyright = FeedTools.tidy_html(@copyright)
         end
+
         # Strip surrounding whitespace, then turn an empty result into nil.
+        @copyright = @copyright.strip unless @copyright.nil?
+        @copyright = nil if @copyright == ""
       end
       return @copyright
     end
 
     # Sets the feed item's copyright information
@@ -961,15 +1046,17 @@
           end          
           for enclosure in @enclosures
             if enclosure.categories.nil?
               enclosure.categories = []
             end
-            enclosure.categories << EnclosureCategory.new(
-              FeedTools.unescape_entities(category_path),
-              FeedTools.unescape_entities("http://www.apple.com/itunes/store/"),
-              FeedTools.unescape_entities("iTunes Music Store Categories")
-            )
+            enclosure.categories << FeedTools::Feed::Category.new
+            enclosure.categories.last.term =
+              FeedTools.unescape_entities(category_path)
+            enclosure.categories.last.scheme =
+              "http://www.apple.com/itunes/store/"
+            enclosure.categories.last.label =
+              "iTunes Music Store Categories"
           end
         end
 
         for enclosure in @enclosures
           # Clean up any of those attributes that incorrectly have ""
@@ -1059,22 +1146,38 @@
     # Returns the feed item author
     def author
       if @author.nil?
         @author = FeedTools::Feed::Author.new
         unless root_node.nil?
-          author_node = XPath.first(root_node, "author")
+          author_node = XPath.first(root_node, "atom10:author",
+            FEED_TOOLS_NAMESPACES)
           if author_node.nil?
+            author_node = XPath.first(root_node, "atom03:author",
+              FEED_TOOLS_NAMESPACES)
+          end
+          if author_node.nil?
+            author_node = XPath.first(root_node, "atom:author")
+          end
+          if author_node.nil?
+            author_node = XPath.first(root_node, "author")
+          end
+          if author_node.nil?
             author_node = XPath.first(root_node, "managingEditor")
           end
           if author_node.nil?
+            author_node = XPath.first(root_node, "dc:author",
+              FEED_TOOLS_NAMESPACES)
+          end
+          if author_node.nil?
             author_node = XPath.first(root_node, "dc:author")
           end
           if author_node.nil?
-            author_node = XPath.first(root_node, "dc:creator")
+            author_node = XPath.first(root_node, "dc:creator",
+              FEED_TOOLS_NAMESPACES)
           end
           if author_node.nil?
-            author_node = XPath.first(root_node, "atom:author")
+            author_node = XPath.first(root_node, "dc:creator")
           end
         end
         unless author_node.nil?
           @author.raw = FeedTools.unescape_entities(
             XPath.first(author_node, "text()").to_s)
@@ -1278,11 +1381,14 @@
     def itunes_duration=(new_itunes_duration)
       # The value is stored as given; this setter does no parsing or
       # validation.
       @itunes_duration = new_itunes_duration
     end
     
     # Returns the feed item time
-    def time
+    def time(options = {})
+      validate_options([ :estimate_timestamp ],
+                       options.keys)
+      options = { :estimate_timestamp => true }.merge(options)
       if @time.nil?
         unless root_node.nil?
           time_string = XPath.first(root_node, "pubDate/text()").to_s
           if time_string == ""
             time_string = XPath.first(root_node, "dc:date/text()").to_s
@@ -1298,26 +1404,28 @@
           end
         end
         begin
           time_string = "" if time_string.nil?
           if time_string != ""
-            @time = Time.parse(time_string)
-          else
-            @time = succ_time
-            if @time.nil?
-              @time = prev_time
-            end
+            @time = Time.parse(time_string).gmtime
           end
         rescue
-          @time = succ_time
+        end
+        if options[:estimate_timestamp]
           if @time.nil?
-            @time = prev_time
+            begin
+              @time = succ_time
+              if @time.nil?
+                @time = prev_time
+              end
+            rescue
+            end
+            if @time.nil?
+              @time = Time.now.gmtime
+            end
           end
         end
-        if @time.nil?
-          @time = Time.now.gmtime
-        end
       end
       return @time
     end
     
     # Sets the feed item time
@@ -1326,51 +1434,53 @@
     end
     
     # Returns 1 second after the previous item's time.
     #
     # Returns nil when this item has no parent feed, is not found in the
     # parent's @items list, is the first entry in that list, or when any
     # error is raised along the way.
     def succ_time #:nodoc:
       begin
-        if feed.nil?
+        parent_feed = self.feed
+        if parent_feed.nil?
           return nil
         end
-        if feed.instance_variable_get("@items").nil?
-          feed.items
+        if parent_feed.instance_variable_get("@items").nil?
+          parent_feed.items
         end
-        unsorted_items = feed.instance_variable_get("@items")
+        unsorted_items = parent_feed.instance_variable_get("@items")
         item_index = unsorted_items.index(self)
         if item_index.nil?
           return nil
         end
         # The first item has no predecessor to derive a time from.
         if item_index <= 0
           return nil
         end
         previous_item = unsorted_items[item_index - 1]
         # With :estimate_timestamp => false the neighbour does not fall back
         # to succ_time/prev_time itself. If it has no time of its own the
         # call returns nil, nil + 1 raises, and the rescue below yields nil.
-        return (previous_item.time + 1)
+        return (previous_item.time(:estimate_timestamp => false) + 1)
       rescue
         return nil
       end
     end
     #private :succ_time
 
     # Returns 1 second before the succeeding item's time.
     #
     # Returns nil when this item has no parent feed, is not found in the
     # parent's @items list, is the last entry in that list, or when any
     # error is raised along the way.
     def prev_time #:nodoc:
       begin
-        if feed.nil?
+        parent_feed = self.feed
+        if parent_feed.nil?
           return nil
         end
-        if feed.instance_variable_get("@items").nil?
-          feed.items
+        if parent_feed.instance_variable_get("@items").nil?
+          parent_feed.items
         end
-        unsorted_items = feed.instance_variable_get("@items")
+        unsorted_items = parent_feed.instance_variable_get("@items")
         item_index = unsorted_items.index(self)
         if item_index.nil?
           return nil
         end
         # The last item has no successor to derive a time from.
         if item_index >= (unsorted_items.size - 1)
           return nil
         end
         succeeding_item = unsorted_items[item_index + 1]
         # With :estimate_timestamp => false the neighbour does not fall back
         # to succ_time/prev_time itself. If it has no time of its own the
         # call returns nil, nil - 1 raises, and the rescue below yields nil.
-        return (succeeding_item.time - 1)
+        return (succeeding_item.time(:estimate_timestamp => false) - 1)
       rescue
         return nil
       end
     end
     #private :prev_time
@@ -1383,11 +1493,11 @@
           if updated_string == ""
             updated_string = XPath.first(root_node, "modified/text()").to_s
           end
         end
         if updated_string != nil && updated_string != ""
-          @updated = Time.parse(updated_string) rescue nil
+          @updated = Time.parse(updated_string).gmtime rescue nil
         else
           @updated = nil
         end
       end
       return @updated
@@ -1412,11 +1522,11 @@
           if issued_string == ""
             issued_string = XPath.first(root_node, "dc:date/text()").to_s
           end
         end
         if issued_string != nil && issued_string != ""
-          @issued = Time.parse(issued_string) rescue nil
+          @issued = Time.parse(issued_string).gmtime rescue nil
         else
           @issued = nil
         end
       end
       return @issued
@@ -1629,11 +1739,12 @@
           end
           build_xml_hook(feed_type, version, xml_builder)
         end
       elsif feed_type == "atom" && version == 0.3
         # normal atom format
-        return xml_builder.entry("xmlns" => "http://purl.org/atom/ns#") do
+        return xml_builder.entry("xmlns" =>
+            FEED_TOOLS_NAMESPACES['atom03']) do
           unless title.nil? || title == ""
             xml_builder.title(title,
                 "mode" => "escaped",
                 "type" => "text/html")
           end
@@ -1671,10 +1782,11 @@
           end
           build_xml_hook(feed_type, version, xml_builder)
         end
       elsif feed_type == "atom" && version == 1.0
         # normal atom format
-        return xml_builder.entry("xmlns" => "http://www.w3.org/2005/Atom") do
+        return xml_builder.entry("xmlns" =>
+            FEED_TOOLS_NAMESPACES['atom10']) do
           unless title.nil? || title == ""
             xml_builder.title(title,
                 "type" => "html")
           end
           xml_builder.author do