lib/spidr/page.rb in spidr-0.2.3 vs lib/spidr/page.rb in spidr-0.2.4
- old
+ new
@@ -59,17 +59,23 @@
end
alias ok? is_ok?
#
- # Determines if the response code is `301` or `307`.
+ # Determines if the response code is `300`, `301`, `302`, `303`
+ # or `307`.
#
# @return [Boolean]
- # Specifies whether the response code is `301` or `307`.
+ # Specifies whether the response code is an HTTP Redirect code.
#
def is_redirect?
- (code == 301 || code == 307)
+ case code
+ when 300..303, 307
+ true
+ else
+ false
+ end
end
alias redirect? is_redirect?
#
@@ -143,11 +149,11 @@
#
# @return [String]
# The Content-Type of the page.
#
def content_type
- @response['Content-Type']
+ (@response['Content-Type'] || '')
end
#
# The content types of the page.
#
@@ -155,21 +161,21 @@
# The values within the Content-Type header.
#
# @since 0.2.2
#
def content_types
- @headers['content-type']
+ (@headers['content-type'] || [])
end
#
# Determines if the page is plain-text.
#
# @return [Boolean]
# Specifies whether the page is plain-text.
#
def plain_text?
- content_types.include?('text/plain')
+ is_content_type?('text/plain')
end
alias txt? plain_text?
#
@@ -177,103 +183,103 @@
#
# @return [Boolean]
# Specifies whether the page is an HTML document.
#
def html?
- content_types.include?('text/html')
+ is_content_type?('text/html')
end
#
# Determines if the page is an XML document.
#
# @return [Boolean]
# Specifies whether the page is an XML document.
#
def xml?
- content_types.include?('text/xml')
+ is_content_type?('text/xml')
end
#
# Determines if the page is an XML Stylesheet (XSL).
#
# @return [Boolean]
# Specifies whether the page is an XML Stylesheet (XSL).
#
def xsl?
- content_types.include?('text/xsl')
+ is_content_type?('text/xsl')
end
#
# Determines if the page is JavaScript.
#
# @return [Boolean]
# Specifies whether the page is JavaScript.
#
def javascript?
- content_types.include?('text/javascript') || \
- content_types.include?('application/javascript')
+ is_content_type?('text/javascript') || \
+ is_content_type?('application/javascript')
end
#
# Determines if the page is a CSS stylesheet.
#
# @return [Boolean]
# Specifies whether the page is a CSS stylesheet.
#
def css?
- content_types.include?('text/css')
+ is_content_type?('text/css')
end
#
# Determines if the page is an RSS feed.
#
# @return [Boolean]
# Specifies whether the page is an RSS feed.
#
def rss?
- content_types.include?('application/rss+xml') || \
- content_types.include?('application/rdf+xml')
+ is_content_type?('application/rss+xml') || \
+ is_content_type?('application/rdf+xml')
end
#
# Determines if the page is an Atom feed.
#
# @return [Boolean]
# Specifies whether the page is an Atom feed.
#
def atom?
- content_types.include?('application/atom+xml')
+ is_content_type?('application/atom+xml')
end
#
# Determines if the page is an MS Word document.
#
# @return [Boolean]
# Specifies whether the page is an MS Word document.
#
def ms_word?
- content_types.include?('application/msword')
+ is_content_type?('application/msword')
end
#
# Determines if the page is a PDF document.
#
# @return [Boolean]
# Specifies whether the page is a PDF document.
#
def pdf?
- content_types.include?('application/pdf')
+ is_content_type?('application/pdf')
end
#
# Determines if the page is a ZIP archive.
#
# @return [Boolean]
# Specifies whether the page is a ZIP archive.
#
def zip?
- content_types.include?('application/zip')
+ is_content_type?('application/zip')
end
#
# The raw Cookie String sent along with the page.
#
@@ -327,11 +333,11 @@
#
# @return [String]
# The body of the response.
#
def body
- @response.body
+ (@response.body || '')
end
#
# Returns a parsed document object for HTML, XML, RSS and Atom pages.
#
@@ -342,11 +348,11 @@
#
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
#
def doc
- return nil if (body.nil? || body.empty?)
+ return nil if body.empty?
begin
if html?
return @doc ||= Nokogiri::HTML(body)
elsif (xml? || xsl? || rss? || atom?)
@@ -373,14 +379,14 @@
#
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
#
def search(*paths)
if doc
- return doc.search(*paths)
+ doc.search(*paths)
+ else
+ []
end
-
- return []
end
#
# Searches for the first occurrence of an XPath or CSS Path expression.
#
@@ -393,14 +399,12 @@
#
# @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
#
def at(*arguments)
if doc
- return doc.at(*arguments)
+ doc.at(*arguments)
end
-
- return nil
end
alias / search
alias % at
@@ -410,11 +414,11 @@
# @return [String]
# The inner-text of the title element of the page.
#
def title
if (node = at('//title'))
- return node.inner_text
+ node.inner_text
end
end
#
# The links from within the page.
@@ -428,12 +432,11 @@
add_url = lambda { |url|
urls << url unless (url.nil? || url.empty?)
}
- case code
- when 300..303, 307
+ if self.is_redirect?
location = @headers['location']
if location.kind_of?(Array)
# handle multiple location URLs
location.each(&add_url)
@@ -505,10 +508,26 @@
end
protected
#
+ # Determines if any of the content-types of the page include a given
+ # type.
+ #
+ # @param [String] type
+ # The content-type to test for.
+ #
+ # @return [Boolean]
+ # Specifies whether the page includes the given content-type.
+ #
+ # @since 0.2.4
+ #
+ def is_content_type?(type)
+ content_types.any? { |content| content.include?(type) }
+ end
+
+ #
# Provides transparent access to the values in `headers`.
#
def method_missing(sym,*args,&block)
if (args.empty? && block.nil?)
name = sym.id2name.sub('_','-')
@@ -516,8 +535,8 @@
return @response[name] if @response.key?(name)
end
return super(sym,*args,&block)
end
-
+
end
end