lib/feed_tools/helpers/uri_helper.rb in feedtools-0.2.26 vs lib/feed_tools/helpers/uri_helper.rb in feedtools-0.2.27
- old
+ new
@@ -57,18 +57,38 @@
# Attempts to ensure that the passed url is valid and sane. Accepts very,
# very ugly urls and makes every effort to figure out what it was supposed
# to be. Also translates from the feed: and rss: pseudo-protocols to the
# http: protocol.
def self.normalize_url(url)
- if url.kind_of?(URI)
+ if url.nil?
+ return nil
+ end
+ if !url.kind_of?(String)
url = url.to_s
end
if url.blank?
- return nil
+ return ""
end
- normalized_url = CGI.unescape(url.strip)
+ normalized_url = url.strip
+ begin
+ normalized_url =
+ FeedTools::URI.convert_path(normalized_url.strip).normalize.to_s
+ rescue Exception
+ end
+
+ begin
+ begin
+ normalized_url =
+ FeedTools::URI.parse(normalized_url.strip).normalize.to_s
+ rescue Exception
+ normalized_url = CGI.unescape(url.strip)
+ end
+ rescue Exception
+ normalized_url = url.strip
+ end
+
# if a url begins with the '/' character, it only makes sense that they
# meant to be using a file:// url. Fix it for them.
if normalized_url.length > 0 && normalized_url[0..0] == "/"
normalized_url = "file://" + normalized_url
end
@@ -88,89 +108,61 @@
end
# deal with all of the many ugly possibilities involved in the rss:
# and feed: pseudo-protocols (incidentally, whose crazy idea was this
# mess?)
+ normalized_url.gsub!(/^htp:\/*/i, "http://")
normalized_url.gsub!(/^http:\/*(feed:\/*)?/i, "http://")
normalized_url.gsub!(/^http:\/*(rss:\/*)?/i, "http://")
normalized_url.gsub!(/^feed:\/*(http:\/*)?/i, "http://")
normalized_url.gsub!(/^rss:\/*(http:\/*)?/i, "http://")
normalized_url.gsub!(/^file:\/*/i, "file:///")
normalized_url.gsub!(/^https:\/*/i, "https://")
+ normalized_url.gsub!(/^mms:\/*/i, "http://")
# fix (very) bad urls (usually of the user-entered sort)
normalized_url.gsub!(/^http:\/*(http:\/*)*/i, "http://")
+ normalized_url.gsub!(/^http:\/*$/i, "")
if (normalized_url =~ /^file:/i) == 0
# Adjust windows-style urls
normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/i, 'file:///\1:')
normalized_url.gsub!(/\\/, '/')
else
- if (normalized_url =~ /^https?:\/\//i) == nil
+ if FeedTools::URI.parse(normalized_url).scheme == nil &&
+ normalized_url =~ /\./ &&
normalized_url = "http://" + normalized_url
end
if normalized_url == "http://"
return nil
end
- begin
- scheme, host_part, path =
- normalized_url.scan(/^(https?):\/\/([^\/]+)\/(.*)/i).flatten
- if scheme != nil && host_part != nil && path != nil
- scheme = scheme.downcase
- if FeedTools::UriHelper.idn_enabled?
- host_part =
- IDN::Idna.toASCII(host_part)
- end
- new_path = ""
- for index in 0...path.size
- if path[index] <= 32 || path[index] >= 126
- new_path << ("%" + path[index].to_s(16).upcase)
- else
- new_path << path[index..index]
- end
- end
- path = new_path
- normalized_url = scheme + "://" + host_part + "/" + path
- end
- rescue Object
- end
- begin
- feed_uri = URI.parse(normalized_url)
- if feed_uri.scheme == nil
- feed_uri.scheme = "http"
- end
- if feed_uri.path.blank?
- feed_uri.path = "/"
- end
- if (feed_uri.path =~ /^[\/]+/) == 0
- feed_uri.path.gsub!(/^[\/]+/, "/")
- end
- while (feed_uri.path =~ /^\/\.\./)
- feed_uri.path.gsub!(/^\/\.\./, "")
- end
- if feed_uri.path.blank?
- feed_uri.path = "/"
- end
- feed_uri.host.downcase!
- normalized_url = feed_uri.to_s
- rescue URI::InvalidURIError
- end
end
+ if normalized_url =~ /^https?:\/\/#/i
+ normalized_url.gsub!(/^https?:\/\/#/i, "#")
+ end
+ if normalized_url =~ /^https?:\/\/\?/i
+ normalized_url.gsub!(/^https?:\/\/\?/i, "?")
+ end
- # We can't do a proper set of escaping, so this will
- # have to do.
- normalized_url.gsub!(/%20/, " ")
- normalized_url.gsub!(/ /, "%20")
-
+ normalized_url =
+ FeedTools::URI.parse(normalized_url.strip).normalize.to_s
return normalized_url
end
# Resolves a relative uri
def self.resolve_relative_uri(relative_uri, base_uri_sources=[])
return relative_uri if base_uri_sources.blank?
return nil if relative_uri.nil?
begin
- base_uri = URI.parse(
+ # Massive HACK to get around file protocol URIs being used to
+ # resolve relative URIs on feeds in the local file system.
+ # Better to leave these URIs unresolved and hope some other
+ # tool resolves them correctly.
+ base_uri_sources.reject! do |base_uri|
+ base_uri == nil ||
+ FeedTools::URI.parse(base_uri).scheme == "file"
+ end
+ base_uri = FeedTools::URI.parse(
FeedTools::XmlHelper.select_not_blank(base_uri_sources))
resolved_uri = base_uri
if relative_uri.to_s != ''
resolved_uri = base_uri + relative_uri.to_s
end
@@ -205,10 +197,10 @@
unless url.kind_of? String
raise ArgumentError, "Expected String, got #{url.class.name}"
end
normalized_url = normalize_url(url)
require 'uuidtools'
- return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri_string
+ return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri.to_s
end
# Returns true if the parameter appears to be a valid uri
def self.is_uri?(url)
return false if url.nil?
\ No newline at end of file