lib/spidr/page.rb in spidr-0.2.4 vs lib/spidr/page.rb in spidr-0.2.5
- old
+ new
@@ -60,19 +60,22 @@
alias ok? is_ok?
#
# Determines if the response code is `300`, `301`, `302`, `303`
- # or `307`.
+ # or `307`. Also checks for "soft" redirects added at the page
+ # level by a meta refresh tag.
#
# @return [Boolean]
#   Specifies whether the response code is an HTTP Redirect code.
#
def is_redirect?
case code
when 300..303, 307
true
+ when 200
+ meta_redirect?
else
false
end
end
@@ -432,22 +435,12 @@
add_url = lambda { |url|
urls << url unless (url.nil? || url.empty?)
}
- if self.is_redirect?
- location = @headers['location']
+ self.redirects_to.each(&add_url) if self.is_redirect?
- if location.kind_of?(Array)
- # handle multiple location URLs
- location.each(&add_url)
- else
- # usually the location header contains a single String
- add_url.call(location)
- end
- end
-
if (html? && doc)
doc.search('a[@href]').each do |a|
add_url.call(a.get_attribute('href'))
end
@@ -470,10 +463,31 @@
return urls
end
#
+ # URL(s) that this document redirects to.
+ #
+ # @return [Array<String>]
+ # The links that this page redirects to (usually found in a
+ # location header or by way of a page-level meta redirect).
+ #
+ def redirects_to
+ location = @headers['location']
+
+ if location.nil?
+ # check page-level meta redirects if there isn't a location header
+ meta_redirect
+ elsif location.kind_of?(Array)
+ location
+ else
+ # usually the location header contains a single String
+ [location]
+ end
+ end
+
+ #
# Absolute URIs from within the page.
#
# @return [Array<URI::HTTP>]
# The links from within the page, converted to absolute URIs.
#
@@ -503,9 +517,46 @@
# "/stuff/../"
url.path = URI.expand_path(url.path)
end
return url
+ end
+
+ #
+ # Determines if a page-level "soft" redirect is present. If yes,
+ # returns an array of those redirects (usually a single URL).
+ # Otherwise, returns an empty array.
+ #
+ # @return [Array<String>]
+ # An array of redirect URLs
+ #
+ def meta_redirect
+ redirects = []
+
+ if (html? && doc)
+ search('//meta[@http-equiv and @content]').each do |node|
+ if node.attr('http-equiv') =~ /refresh/i
+ content = node.attr('content')
+
+ if (redirect = content.match(/url=(\S+)$/))
+ redirects << redirect[1]
+ end
+ end
+ end
+ end
+
+ return redirects.uniq
+ end
+
+ #
+ # Returns a boolean indicating whether or not page-level meta
+ # redirects are present in this page.
+ #
+ # @return [Boolean]
+ # Specifies whether the page includes page-level redirects.
+ #
+ def meta_redirect?
+ !meta_redirect.empty?
end
protected
#