lib/spidr/page.rb in spidr-0.2.1 vs lib/spidr/page.rb in spidr-0.2.2

- old
+ new

@@ -1,13 +1,17 @@ require 'spidr/extensions/uri' +require 'set' require 'uri' require 'nokogiri' module Spidr class Page + # Reserved names used within Cookie strings + RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain'] + # URL of the page attr_reader :url # HTTP Response attr_reader :response @@ -140,17 +144,29 @@ def content_type @response['Content-Type'] end # + # The content types of the page. + # + # @return [Array<String>] + # The values within the Content-Type header. + # + # @since 0.2.2 + # + def content_types + @headers['content-type'] + end + + # # Determines if the page is plain-text. # # @return [Boolean] # Specifies whether the page is plain-text. # def plain_text? - (content_type =~ /text\/plain/) == 0 + content_types.include?('text/plain') end alias txt? plain_text? # @@ -158,100 +174,148 @@ # # @return [Boolean] # Specifies whether the page is HTML document. # def html? - (content_type =~ /text\/html/) == 0 + content_types.include?('text/html') end # # Determines if the page is XML document. # # @return [Boolean] # Specifies whether the page is XML document. # def xml? - (content_type =~ /text\/xml/) == 0 + content_types.include?('text/xml') end # # Determines if the page is XML Stylesheet (XSL). # # @return [Boolean] # Specifies whether the page is XML Stylesheet (XSL). # def xsl? - (content_type =~ /text\/xsl/) == 0 + content_types.include?('text/xsl') end # # Determines if the page is JavaScript. # # @return [Boolean] # Specifies whether the page is JavaScript. # def javascript? - (content_type =~ /(text|application)\/javascript/) == 0 + content_types.include?('text/javascript') || \ + content_types.include?('application/javascript') end # # Determines if the page is a CSS stylesheet. # # @return [Boolean] # Specifies whether the page is a CSS stylesheet. # def css? - (content_type =~ /text\/css/) == 0 + content_types.include?('text/css') end # # Determines if the page is a RSS feed. # # @return [Boolean] # Specifies whether the page is a RSS feed. # def rss? - (content_type =~ /application\/(rss|rdf)\+xml/) == 0 + content_types.include?('application/rss+xml') || \ + content_types.include?('application/rdf+xml') end # # Determines if the page is an Atom feed. # # @return [Boolean] # Specifies whether the page is an Atom feed. # def atom? - (content_type =~ /application\/atom\+xml/) == 0 + content_types.include?('application/atom+xml') end # # Determines if the page is a MS Word document. # # @return [Boolean] # Specifies whether the page is a MS Word document. # def ms_word? - (content_type =~ /application\/msword/) == 0 + content_types.include?('application/msword') end # # Determines if the page is a PDF document. # # @return [Boolean] # Specifies whether the page is a PDF document. # def pdf? - (content_type =~ /application\/pdf/) == 0 + content_types.include?('application/pdf') end # # Determines if the page is a ZIP archive. # # @return [Boolean] # Specifies whether the page is a ZIP archive. # def zip? - (content_type =~ /application\/zip/) == 0 + content_types.include?('application/zip') + end + + # + # The raw Cookie String sent along with the page. + # + # @return [String] + # The raw Cookie from the response. + # + # @since 0.2.2 + # + def cookie + (@response['Set-Cookie'] || '') + end + + # + # The Cookie values sent along with the page. + # + # @return [Array<String>] + # The Cookies from the response. + # + # @since 0.2.2 + # + def cookies + (@headers['set-cookie'] || []) + end + + # + # The Cookie key -> value pairs returned with the response. + # + # @return [Hash{String => String}] + # The cookie keys and values. + # + # @since 0.2.2 + # + def cookie_params + params = {} + + cookies.each do |key_value| + key, value = key_value.split('=',2) + + next if RESERVED_COOKIE_NAMES.include?(key) + + params[key] = (value || '') + end + + return params end # # The body of the response. #