lib/spidr/page.rb in spidr-0.2.1 vs lib/spidr/page.rb in spidr-0.2.2
- old
+ new
@@ -1,13 +1,17 @@
require 'spidr/extensions/uri'
+require 'set'
require 'uri'
require 'nokogiri'
module Spidr
class Page
+ # Reserved names used within Cookie strings
+ RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
+
# URL of the page
attr_reader :url
# HTTP Response
attr_reader :response
@@ -140,17 +144,29 @@
def content_type
@response['Content-Type']
end
#
+ # The content types of the page.
+ #
+ # @return [Array<String>]
+ # The values within the Content-Type header.
+ #
+ # @since 0.2.2
+ #
+ def content_types
+ @headers['content-type']
+ end
+
+ #
# Determines if the page is plain-text.
#
# @return [Boolean]
# Specifies whether the page is plain-text.
#
def plain_text?
- (content_type =~ /text\/plain/) == 0
+ content_types.include?('text/plain')
end
alias txt? plain_text?
#
@@ -158,100 +174,148 @@
#
# @return [Boolean]
# Specifies whether the page is an HTML document.
#
def html?
- (content_type =~ /text\/html/) == 0
+ content_types.include?('text/html')
end
#
# Determines if the page is an XML document.
#
# @return [Boolean]
# Specifies whether the page is an XML document.
#
def xml?
- (content_type =~ /text\/xml/) == 0
+ content_types.include?('text/xml')
end
#
# Determines if the page is an XML Stylesheet (XSL).
#
# @return [Boolean]
# Specifies whether the page is an XML Stylesheet (XSL).
#
def xsl?
- (content_type =~ /text\/xsl/) == 0
+ content_types.include?('text/xsl')
end
#
# Determines if the page is JavaScript.
#
# @return [Boolean]
# Specifies whether the page is JavaScript.
#
def javascript?
- (content_type =~ /(text|application)\/javascript/) == 0
+ content_types.include?('text/javascript') || \
+ content_types.include?('application/javascript')
end
#
# Determines if the page is a CSS stylesheet.
#
# @return [Boolean]
# Specifies whether the page is a CSS stylesheet.
#
def css?
- (content_type =~ /text\/css/) == 0
+ content_types.include?('text/css')
end
#
# Determines if the page is an RSS feed.
#
# @return [Boolean]
# Specifies whether the page is an RSS feed.
#
def rss?
- (content_type =~ /application\/(rss|rdf)\+xml/) == 0
+ content_types.include?('application/rss+xml') || \
+ content_types.include?('application/rdf+xml')
end
#
# Determines if the page is an Atom feed.
#
# @return [Boolean]
# Specifies whether the page is an Atom feed.
#
def atom?
- (content_type =~ /application\/atom\+xml/) == 0
+ content_types.include?('application/atom+xml')
end
#
# Determines if the page is an MS Word document.
#
# @return [Boolean]
# Specifies whether the page is an MS Word document.
#
def ms_word?
- (content_type =~ /application\/msword/) == 0
+ content_types.include?('application/msword')
end
#
# Determines if the page is a PDF document.
#
# @return [Boolean]
# Specifies whether the page is a PDF document.
#
def pdf?
- (content_type =~ /application\/pdf/) == 0
+ content_types.include?('application/pdf')
end
#
# Determines if the page is a ZIP archive.
#
# @return [Boolean]
# Specifies whether the page is a ZIP archive.
#
def zip?
- (content_type =~ /application\/zip/) == 0
+ content_types.include?('application/zip')
+ end
+
+ #
+ # The raw Cookie String sent along with the page.
+ #
+ # @return [String]
+ # The raw Cookie from the response.
+ #
+ # @since 0.2.2
+ #
+ def cookie
+ (@response['Set-Cookie'] || '')
+ end
+
+ #
+ # The Cookie values sent along with the page.
+ #
+ # @return [Array<String>]
+ # The Cookies from the response.
+ #
+ # @since 0.2.2
+ #
+ def cookies
+ (@headers['set-cookie'] || [])
+ end
+
+ #
+ # The Cookie key -> value pairs returned with the response.
+ #
+ # @return [Hash{String => String}]
+ # The cookie keys and values.
+ #
+ # @since 0.2.2
+ #
+ def cookie_params
+ params = {}
+
+ cookies.each do |key_value|
+ key, value = key_value.split('=',2)
+
+ next if RESERVED_COOKIE_NAMES.include?(key)
+
+ params[key] = (value || '')
+ end
+
+ return params
end
#
# The body of the response.
#