lib/feed_tools/helpers/uri_helper.rb in feedtools-0.2.26 vs lib/feed_tools/helpers/uri_helper.rb in feedtools-0.2.27

- old
+ new

@@ -57,18 +57,38 @@
    # Attempts to ensure that the passed url is valid and sane. Accepts very,
    # very ugly urls and makes every effort to figure out what it was supposed
    # to be. Also translates from the feed: and rss: pseudo-protocols to the
    # http: protocol.
    def self.normalize_url(url)
-     if url.kind_of?(URI)
+     if url.nil?
+       return nil
+     end
+     if !url.kind_of?(String)
        url = url.to_s
      end
      if url.blank?
-       return nil
+       return ""
      end
-     normalized_url = CGI.unescape(url.strip)
+     normalized_url = url.strip
+     begin
+       normalized_url =
+         FeedTools::URI.convert_path(normalized_url.strip).normalize.to_s
+     rescue Exception
+     end
+
+     begin
+       begin
+         normalized_url =
+           FeedTools::URI.parse(normalized_url.strip).normalize.to_s
+       rescue Exception
+         normalized_url = CGI.unescape(url.strip)
+       end
+     rescue Exception
+       normalized_url = url.strip
+     end
+
      # if a url begins with the '/' character, it only makes sense that they
      # meant to be using a file:// url. Fix it for them.
      if normalized_url.length > 0 && normalized_url[0..0] == "/"
        normalized_url = "file://" + normalized_url
      end
@@ -88,89 +108,61 @@
      end
      # deal with all of the many ugly possibilities involved in the rss:
      # and feed: pseudo-protocols (incidentally, whose crazy idea was this
      # mess?)
+     normalized_url.gsub!(/^htp:\/*/i, "http://")
      normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
      normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
      normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
      normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
      normalized_url.gsub!(/^file:\/*/i, "file:///")
      normalized_url.gsub!(/^https:\/*/i, "https://")
+     normalized_url.gsub!(/^mms:\/*/i, "http://")
      # fix (very) bad urls (usually of the user-entered sort)
      normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")
+     normalized_url.gsub!(/^http:\/*$/i, "")
      if (normalized_url =~ /^file:/i) == 0
        # Adjust windows-style urls
        normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
        normalized_url.gsub!(/\\/, '/')
      else
-       if (normalized_url =~ /^https?:\/\//i) == nil
+       if FeedTools::URI.parse(normalized_url).scheme == nil &&
+           normalized_url =~ /\./ &&
          normalized_url = "http://" + normalized_url
        end
        if normalized_url == "http://"
          return nil
        end
-       begin
-         scheme, host_part, path =
-           normalized_url.scan(/^(https?):\/\/([^\/]+)\/(.*)/i).flatten
-         if scheme != nil && host_part != nil && path != nil
-           scheme = scheme.downcase
-           if FeedTools::UriHelper.idn_enabled?
-             host_part =
-               IDN::Idna.toASCII(host_part)
-           end
-           new_path = ""
-           for index in 0...path.size
-             if path[index] <= 32 || path[index] >= 126
-               new_path << ("%" + path[index].to_s(16).upcase)
-             else
-               new_path << path[index..index]
-             end
-           end
-           path = new_path
-           normalized_url = scheme + "://" + host_part + "/" + path
-         end
-       rescue Object
-       end
-       begin
-         feed_uri = URI.parse(normalized_url)
-         if feed_uri.scheme == nil
-           feed_uri.scheme = "http"
-         end
-         if feed_uri.path.blank?
-           feed_uri.path = "/"
-         end
-         if (feed_uri.path =~ /^[\/]+/) == 0
-           feed_uri.path.gsub!(/^[\/]+/, "/")
-         end
-         while (feed_uri.path =~ /^\/\.\./)
-           feed_uri.path.gsub!(/^\/\.\./, "")
-         end
-         if feed_uri.path.blank?
-           feed_uri.path = "/"
-         end
-         feed_uri.host.downcase!
-         normalized_url = feed_uri.to_s
-       rescue URI::InvalidURIError
-       end
      end
+     if normalized_url =~ /^https?:\/\/#/i
+       normalized_url.gsub!(/^https?:\/\/#/i, "#")
+     end
+     if normalized_url =~ /^https?:\/\/\?/i
+       normalized_url.gsub!(/^https?:\/\/\?/i, "?")
+     end
-     # We can't do a proper set of escaping, so this will
-     # have to do.
-     normalized_url.gsub!(/%20/, " ")
-     normalized_url.gsub!(/ /, "%20")
-
+     normalized_url =
+       FeedTools::URI.parse(normalized_url.strip).normalize.to_s
      return normalized_url
    end

    # Resolves a relative uri
    def self.resolve_relative_uri(relative_uri, base_uri_sources=[])
      return relative_uri if base_uri_sources.blank?
      return nil if relative_uri.nil?
      begin
-       base_uri = URI.parse(
+       # Massive HACK to get around file protocol URIs being used to
+       # resolve relative URIs on feeds in the local file system.
+       # Better to leave these URIs unresolved and hope some other
+       # tool resolves them correctly.
+       base_uri_sources.reject! do |base_uri|
+         base_uri == nil ||
+           FeedTools::URI.parse(base_uri).scheme == "file"
+       end
+       base_uri = FeedTools::URI.parse(
          FeedTools::XmlHelper.select_not_blank(base_uri_sources))
        resolved_uri = base_uri
        if relative_uri.to_s != ''
          resolved_uri = base_uri + relative_uri.to_s
        end
@@ -205,10 +197,10 @@
      unless url.kind_of? String
        raise ArgumentError, "Expected String, got #{url.class.name}"
      end
      normalized_url = normalize_url(url)
      require 'uuidtools'
-     return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri_string
+     return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri.to_s
    end

    # Returns true if the parameter appears to be a valid uri
    def self.is_uri?(url)
      return false if url.nil?