lib/spidr/page.rb in spidr-0.2.4 vs lib/spidr/page.rb in spidr-0.2.5

- old
+ new

@@ -60,19 +60,22 @@ alias ok? is_ok? # # Determines if the response code is `300`, `301`, `302`, `303` - # or `307`. + # or `307`. Also checks for "soft" redirects added at the page + # level by a meta refresh tag. # # @return [Boolean] # Specifies whether the response code is a HTTP Redirect code. # def is_redirect? case code when 300..303, 307 true + when 200 + meta_redirect? else false end end @@ -432,22 +435,12 @@ add_url = lambda { |url| urls << url unless (url.nil? || url.empty?) } - if self.is_redirect? - location = @headers['location'] + self.redirects_to.each(&add_url) if self.is_redirect? - if location.kind_of?(Array) - # handle multiple location URLs - location.each(&add_url) - else - # usually the location header contains a single String - add_url.call(location) - end - end - if (html? && doc) doc.search('a[@href]').each do |a| add_url.call(a.get_attribute('href')) end @@ -470,10 +463,31 @@ return urls end # + # URL(s) that this document redirects to. + # + # @return [Array<String>] + # The links that this page redirects to (usually found in a + # location header or by way of a page-level meta redirect). + # + def redirects_to + location = @headers['location'] + + if location.nil? + # check page-level meta redirects if there isn't a location header + meta_redirect + elsif location.kind_of?(Array) + location + else + # usually the location header contains a single String + [location] + end + end + + # # Absolute URIs from within the page. # # @return [Array<URI::HTTP>] # The links from within the page, converted to absolute URIs. # @@ -503,9 +517,46 @@ # "/stuff/../" url.path = URI.expand_path(url.path) end return url + end + + # + # Determines if a page-level "soft" redirect is present. If yes, + # returns an array of those redirects (usually a single URL). + # Otherwise, returns an empty array. + # + # @return [Array<String>] + # An array of redirect URLs + # + def meta_redirect + redirects = [] + + if (html? 
&& doc) + search('//meta[@http-equiv and @content]').each do |node| + if node.attr('http-equiv') =~ /refresh/i + content = node.attr('content') + + if (redirect = content.match(/url=(\S+)$/)) + redirects << redirect[1] + end + end + end + end + + return redirects.uniq + end + + # + # Returns a boolean indicating whether or not page-level meta + # redirects are present in this page. + # + # @return [Boolean] + # Specifies whether the page includes page-level redirects. + # + def meta_redirect? + !meta_redirect.empty? end protected #