#--
# Copyright (c) 2005 Robert Aman
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#++

if Object.const_defined?(:FEED_TOOLS_ENV)
  warn("FeedTools may have been loaded improperly. This may be caused " +
    "by the presence of the RUBYOPT environment variable or by using " +
    "load instead of require. This can also be caused by missing " +
    "the Iconv library, which is common on Windows.")
end

FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] || ENV['RAILS_ENV'] || 'production' # :nodoc:

FEED_TOOLS_VERSION = "0.2.11"

$:.unshift(File.dirname(__FILE__))
$:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")

begin
  require 'iconv'
rescue LoadError
  warn("The Iconv library does not appear to be installed properly. " +
    "FeedTools cannot function properly without it.")
  raise
end

require 'rubygems'

begin
  require 'builder'
rescue LoadError
  # RubyGems version is not available, use included Builder
  require 'feed_tools/vendor/builder'
end

begin
  require 'tidy'
rescue LoadError
  # Ignore the error for now.
end

require 'feed_tools/vendor/htree'

require 'net/http'
require 'net/https'
require 'net/ftp'
require 'rexml/document'

require 'uri'
require 'time'
require 'cgi'
require 'pp'
require 'yaml'

require_gem('activerecord', '>= 1.10.1')
require_gem('uuidtools', '>= 0.1.2')

require 'feed_tools/feed'
require 'feed_tools/feed_item'
require 'feed_tools/database_feed_cache'

#= feed_tools.rb
#
# FeedTools was designed to be a simple XML feed parser, generator, and
# translator with a built-in caching system.
#
#== Example
#  slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
#  slashdot_feed.title
#  => "Slashdot"
#  slashdot_feed.description
#  => "News for nerds, stuff that matters"
#  slashdot_feed.link
#  => "http://slashdot.org/"
#  slashdot_feed.items.first.find_node("slash:hitparade/text()").value
#  => "43,37,28,23,11,3,1"
module FeedTools

  @force_tidy_enabled = true
  @tidy_enabled = false
  @feed_cache = DatabaseFeedCache
  @user_agent = "FeedTools/#{FEED_TOOLS_VERSION} " +
    "+http://www.sporkmonger.com/projects/feedtools/"
  @no_content_string = "[no description]"

  # Error raised when a feed cannot be retrieved
  class FeedAccessError < StandardError
  end

  # Returns the current caching mechanism.
  def FeedTools.feed_cache
    return @feed_cache
  end

  # Sets the current caching mechanism. If set to nil, disables caching.
  # Default is the DatabaseFeedCache class.
  #
  # Objects of this class must accept the following messages:
  #  url
  #  url=
  #  title
  #  title=
  #  link
  #  link=
  #  xml_data
  #  xml_data=
  #  etag
  #  etag=
  #  last_modified
  #  last_modified=
  #  save
  #
  # Additionally, the class itself must accept the following messages:
  #  find_by_id
  #  find_by_url
  #  initialize_cache
  #  connected?
  def FeedTools.feed_cache=(new_feed_cache)
    # TODO: ensure that the feed cache class actually does those things.
    # ==================================================================
    @feed_cache = new_feed_cache
  end
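
  # As a rough illustration of the cache contract documented above, a
  # minimal, non-persistent in-memory cache class might look like the
  # sketch below. The InMemoryFeedCache name and its details are purely
  # hypothetical and are not part of FeedTools.
  #
  #  class InMemoryFeedCache
  #    attr_accessor :id, :url, :title, :link, :xml_data, :etag,
  #      :last_modified
  #
  #    @@entries = []
  #
  #    def save
  #      @@entries << self unless @@entries.include?(self)
  #      return true
  #    end
  #
  #    def self.find_by_id(id)
  #      @@entries.find { |entry| entry.id == id }
  #    end
  #
  #    def self.find_by_url(url)
  #      @@entries.find { |entry| entry.url == url }
  #    end
  #
  #    def self.initialize_cache
  #      @@entries = []
  #    end
  #
  #    def self.connected?
  #      true
  #    end
  #  end
  #
  #  FeedTools.feed_cache = InMemoryFeedCache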

  # Returns true if FeedTools.feed_cache is not nil and a connection with
  # the cache has been successfully established. Also returns false if an
  # error is raised while trying to determine the status of the cache.
  def FeedTools.feed_cache_connected?
    begin
      return false if FeedTools.feed_cache.nil?
      return FeedTools.feed_cache.connected?
    rescue
      return false
    end
  end

  # Returns the currently used user agent string.
  def FeedTools.user_agent
    return @user_agent
  end

  # Sets the user agent string to send in the http headers.
  def FeedTools.user_agent=(new_user_agent)
    @user_agent = new_user_agent
  end

  # Returns the currently used no content string.
  def FeedTools.no_content_string
    return @no_content_string
  end

  # Sets the no content string to use when a feed is missing a content
  # element. Used only for xml output.
  def FeedTools.no_content_string=(new_no_content_string)
    @no_content_string = new_no_content_string
  end
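
  # A typical configuration pass simply assigns to these module-level
  # accessors before any feeds are opened. The values shown here are
  # made-up placeholders, not defaults:
  #
  #  FeedTools.user_agent =
  #    "MyAggregator/1.0 +http://aggregator.example.com/"
  #  FeedTools.no_content_string = "[no description]"
  #  FeedTools.feed_cache = nil  # disables caching entirely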

  # Returns true if the html tidy module can be used.
  #
  # Obviously, you need the tidy gem installed in order to run with html
  # tidy features turned on.
  #
  # This method does a fairly complicated, and probably unnecessarily
  # desperate search for the libtidy library. If you want this thing to
  # execute fast, the best thing to do is to set Tidy.path ahead of time.
  # If Tidy.path is set, this method doesn't do much. If it's not set,
  # it will do its darnedest to find the libtidy library. If you set
  # the LIBTIDYPATH environment variable to the libtidy library, it should
  # be able to find it.
  #
  # Once the library is located, this method will run much faster.
  def FeedTools.tidy_enabled?
    # This is an override variable to keep tidy from being used even if it
    # is available.
    if @force_tidy_enabled == false
      return false
    end
    if @tidy_enabled.nil? || @tidy_enabled == false
      @tidy_enabled = false
      begin
        require 'tidy'
        if Tidy.path.nil?
          # *Shrug*, just brute force it, I guess. There's a lot of places
          # this thing might be hiding in, depending on platform and general
          # sanity of the person who installed the thing. Most of these are
          # probably unlikely, but it's not like checking unlikely locations
          # hurts. Much. Especially if you actually find it.
          libtidy_locations = [
            '/usr/local/lib/libtidy.dylib',
            '/opt/local/lib/libtidy.dylib',
            '/usr/lib/libtidy.dylib',
            '/usr/local/lib/tidylib.dylib',
            '/opt/local/lib/tidylib.dylib',
            '/usr/lib/tidylib.dylib',
            '/usr/local/lib/tidy.dylib',
            '/opt/local/lib/tidy.dylib',
            '/usr/lib/tidy.dylib',
            '/usr/local/lib/libtidy.so',
            '/opt/local/lib/libtidy.so',
            '/usr/lib/libtidy.so',
            '/usr/local/lib/tidylib.so',
            '/opt/local/lib/tidylib.so',
            '/usr/lib/tidylib.so',
            '/usr/local/lib/tidy.so',
            '/opt/local/lib/tidy.so',
            '/usr/lib/tidy.so',
            'C:\Program Files\Tidy\tidy.dll',
            'C:\Tidy\tidy.dll',
            'C:\Ruby\bin\tidy.dll',
            'C:\Ruby\tidy.dll',
            '/usr/local/lib',
            '/opt/local/lib',
            '/usr/lib'
          ]
          # We just made this thing up, but if someone sets it, we'll
          # go ahead and check it
          unless ENV['LIBTIDYPATH'].nil?
            libtidy_locations =
              libtidy_locations.reverse.push(ENV['LIBTIDYPATH'])
          end
          for path in libtidy_locations
            if File.exists? path
              if File.ftype(path) == "file"
                Tidy.path = path
                @tidy_enabled = true
                break
              elsif File.ftype(path) == "directory"
                # Ok, now perhaps we're getting a bit more desperate
                lib_paths =
                  `find #{path} -name '*tidy*' | grep '\\.\\(so\\|dylib\\)$'`
                # If there's more than one, grab the first one and
                # hope for the best, and if it doesn't work, then blame the
                # user for not specifying more accurately.
                tidy_path = lib_paths.split("\n").first
                unless tidy_path.nil?
                  Tidy.path = tidy_path
                  @tidy_enabled = true
                  break
                end
              end
            end
          end
          # Still couldn't find it.
          unless @tidy_enabled
            @tidy_enabled = false
          end
        else
          @tidy_enabled = true
        end
      rescue LoadError
        # Tidy not installed, disable features that rely on tidy.
        @tidy_enabled = false
      end
    end
    return @tidy_enabled
  end

  # Turns html tidy support on or off. Be aware that setting this to true
  # does not mean tidy will be enabled. It simply means that tidy will be
  # enabled if it is available to be enabled.
  def FeedTools.tidy_enabled=(new_tidy_enabled)
    @force_tidy_enabled = new_tidy_enabled
  end

  # Attempts to ensure that the passed url is valid and sane. Accepts very,
  # very ugly urls and makes every effort to figure out what it was supposed
  # to be. Also translates from the feed: and rss: pseudo-protocols to the
  # http: protocol.
  def FeedTools.normalize_url(url)
    if url.nil? || url == ""
      return nil
    end
    normalized_url = url.strip

    # if a url begins with the '/' character, it only makes sense that they
    # meant to be using a file:// url. Fix it for them.
    if normalized_url.length > 0 && normalized_url[0..0] == "/"
      normalized_url = "file://" + normalized_url
    end

    # if a url begins with javascript:, it's quite possibly an attempt at
    # doing something malicious. Let's keep that from getting anywhere,
    # shall we?
    if (normalized_url.downcase =~ /javascript:/) != nil
      return "#"
    end

    # deal with all of the many ugly possibilities involved in the rss:
    # and feed: pseudo-protocols (incidentally, whose crazy idea was this
    # mess?)
    normalized_url.gsub!(/^http:\/*(feed:\/*)?/, "http://")
    normalized_url.gsub!(/^http:\/*(rss:\/*)?/, "http://")
    normalized_url.gsub!(/^feed:\/*(http:\/*)?/, "http://")
    normalized_url.gsub!(/^rss:\/*(http:\/*)?/, "http://")
    normalized_url.gsub!(/^file:\/*/, "file:///")
    normalized_url.gsub!(/^https:\/*/, "https://")
    # fix (very) bad urls (usually of the user-entered sort)
    normalized_url.gsub!(/^http:\/*(http:\/*)*/, "http://")

    if (normalized_url =~ /^file:/) == 0
      # fix bad Windows-based entries
      normalized_url.gsub!(/file:\/\/\/([a-zA-Z]):/, 'file:///\1|')
      # maybe this is too aggressive?
      normalized_url.gsub!(/\\/, '/')
      return normalized_url
    else
      if (normalized_url =~ /https?:\/\//) == nil
        normalized_url = "http://" + normalized_url
      end
      if normalized_url == "http://"
        return nil
      end
      begin
        feed_uri = URI.parse(normalized_url)
        if feed_uri.scheme == nil
          feed_uri.scheme = "http"
        end
        if feed_uri.path == nil || feed_uri.path == ""
          feed_uri.path = "/"
        end
        if (feed_uri.path =~ /^[\/]+/) == 0
          feed_uri.path.gsub!(/^[\/]+/, "/")
        end
        feed_uri.host.downcase!
        return feed_uri.to_s
      rescue URI::InvalidURIError
        return normalized_url
      end
    end
  end
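
  # For example (the example.com urls are placeholders), the normalizer
  # behaves roughly as follows:
  #
  #  FeedTools.normalize_url("feed://http://www.example.com/feed.xml")
  #  => "http://www.example.com/feed.xml"
  #  FeedTools.normalize_url("www.example.com")
  #  => "http://www.example.com/"
  #  FeedTools.normalize_url("javascript:alert('xss')")
  #  => "#"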

  # Converts a url into a tag uri
  def FeedTools.build_tag_uri(url, date)
    unless url.kind_of? String
      raise ArgumentError, "Expected String, got #{url.class.name}"
    end
    unless date.kind_of? Time
      raise ArgumentError, "Expected Time, got #{date.class.name}"
    end
    tag_uri = normalize_url(url)
    unless FeedTools.is_uri?(tag_uri)
      raise ArgumentError, "Must supply a valid URL."
    end
    host = URI.parse(tag_uri).host
    tag_uri.gsub!(/^(http|ftp|file):\/*/, "")
    tag_uri.gsub!(/#/, "/")
    tag_uri = "tag:#{host},#{date.strftime('%Y-%m-%d')}:" +
      "#{tag_uri[(tag_uri.index(host) + host.size)..-1]}"
    return tag_uri
  end

  # Converts a url into a urn:uuid: uri
  def FeedTools.build_urn_uri(url)
    unless url.kind_of? String
      raise ArgumentError, "Expected String, got #{url.class.name}"
    end
    normalized_url = normalize_url(url)
    require 'uuidtools'
    return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri_string
  end

  # Returns true if the parameter appears to be a valid uri
  def FeedTools.is_uri?(url)
    return false if url.nil?
    begin
      uri = URI.parse(url)
      if uri.scheme.nil? || uri.scheme == ""
        return false
      end
    rescue URI::InvalidURIError
      return false
    end
    return true
  end

  # Escapes all html entities
  def FeedTools.escape_entities(html)
    return nil if html.nil?
    escaped_html = CGI.escapeHTML(html)
    escaped_html.gsub!(/'/, "&apos;")
    escaped_html.gsub!(/"/, "&quot;")
    return escaped_html
  end

  # Unescapes all html entities
  def FeedTools.unescape_entities(html)
    return nil if html.nil?
    unescaped_html = html
    unescaped_html.gsub!(/&#x26;/, "&amp;")
    unescaped_html.gsub!(/&#38;/, "&amp;")
    unescaped_html = CGI.unescapeHTML(unescaped_html)
    unescaped_html.gsub!(/&apos;/, "'")
    unescaped_html.gsub!(/&quot;/, "\"")
    return unescaped_html
  end

  # Removes all html tags from the html formatted text.
  def FeedTools.strip_html(html)
    return nil if html.nil?
    # TODO: do this properly
    # ======================
    stripped_html = html.gsub(/<\/?[^>]+>/, "")
    return stripped_html
  end
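
  # A quick, illustrative pass through the uri and markup helpers above
  # (example.com and the sample markup are placeholders):
  #
  #  FeedTools.build_tag_uri("http://example.com/feed.xml",
  #    Time.utc(2005, 1, 1))
  #  => "tag:example.com,2005-01-01:/feed.xml"
  #  FeedTools.is_uri?("http://example.com/")
  #  => true
  #  FeedTools.is_uri?("not a uri")
  #  => false
  #  FeedTools.strip_html("<p>Hello <b>world</b></p>")
  #  => "Hello world"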

  # Tidies up the html
  def FeedTools.tidy_html(html)
    return nil if html.nil?
    if FeedTools.tidy_enabled?
      is_fragment = true
      html.gsub!(/&lt;!'/, "&amp;lt;!'")
      if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
          (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
        is_fragment = false
      end
      if (html.strip =~ /<\?xml(.|\n)*\?>/) != nil
        is_fragment = false
      end
      tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
        tidy.options.output_xml = true
        tidy.options.indent = false
        tidy.options.wrap_attributes = true
        tidy.options.logical_emphasis = true
        tidy.options.doctype = "omit"
        xml = tidy.clean(html)
        xml
      end
      if is_fragment
        # Tidy sticks <html>...<body>[our html]</body>...</html> in.
        # We don't want this.
        tidy_html.strip!
        tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
        tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
        tidy_html.strip!
      end
      tidy_html.gsub!(/&#x26;/, "&amp;")
      tidy_html.gsub!(/&#38;/, "&amp;")
    else
      tidy_html = html
    end
    return tidy_html
  end

  # Removes all dangerous html tags from the html formatted text.
  # If mode is set to :escape, dangerous and unknown elements will
  # be escaped. If mode is set to :strip, dangerous and unknown
  # elements and all children will be removed entirely.
  # Dangerous or unknown attributes are always removed.
  def FeedTools.sanitize_html(html, mode=:strip)
    return nil if html.nil?

    # Lists borrowed from Mark Pilgrim's feedparser
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
      'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
      'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
      'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5',
      'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
      'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q',
      's', 'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub',
      'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead',
      'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset',
      'accesskey', 'action', 'align', 'alt', 'axis', 'border',
      'cellpadding', 'cellspacing', 'char', 'charoff', 'charset',
      'checked', 'cite', 'class', 'clear', 'cols', 'colspan', 'color',
      'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
      'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media',
      'method', 'multiple', 'name', 'nohref', 'noshade', 'nowrap',
      'prompt', 'readonly', 'rel', 'rev', 'rows', 'rowspan', 'rules',
      'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
      'summary', 'tabindex', 'target', 'title', 'type', 'usemap',
      'valign', 'value', 'vspace', 'width']

    # Replace with appropriate named entities
    html.gsub!(/&#x26;/, "&amp;")
    html.gsub!(/&#38;/, "&amp;")
    html.gsub!(/&lt;!'/, "&amp;lt;!'")

    # Hackity hack. But it works, and it seems plenty fast enough.
    html_doc = HTree.parse_xml("