lib/spidr/page.rb in spidr-0.2.7 vs lib/spidr/page.rb in spidr-0.3.0
- old
+ new
@@ -1,19 +1,18 @@
-require 'spidr/extensions/uri'
+require 'spidr/headers'
+require 'spidr/body'
+require 'spidr/links'
-require 'set'
-require 'uri'
-require 'nokogiri'
-
module Spidr
#
# Represents a requested page from a website.
#
class Page
- # Reserved names used within Cookie strings
- RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain']
+ include Headers
+ include Body
+ include Links
# URL of the page
attr_reader :url
# HTTP Response
@@ -37,31 +36,26 @@
@headers = response.to_hash
@doc = nil
end
#
- # The response code from the page.
+ # The meta-redirect links of the page.
#
- # @return [Integer]
- # Response code from the page.
+ # @return [Array<String>]
+ # All meta-redirect links in the page.
#
- def code
- @response.code.to_i
- end
-
+ # @deprecated
+ # Deprecated in 0.3.0 and will be removed in 0.4.0.
+ # Use {#meta_redirects} instead.
#
- # Determines if the response code is `200`.
- #
- # @return [Boolean]
- # Specifies whether the response code is `200`.
- #
- def is_ok?
- code == 200
+ def meta_redirect
+ STDERR.puts 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0'
+ STDERR.puts 'DEPRECATION: Use Spidr::Page#meta_redirects instead'
+
+ meta_redirects
end
- alias ok? is_ok?
-
#
# Determines if the response code is `300`, `301`, `302`, `303`
# or `307`. Also checks for "soft" redirects added at the page
# level by a meta refresh tag.
#
@@ -79,534 +73,36 @@
end
end
alias redirect? is_redirect?
- #
- # Determines if the response code is `308`.
- #
- # @return [Boolean]
- # Specifies whether the response code is `308`.
- #
- def timedout?
- code == 308
- end
+ protected
#
- # Determines if the response code is `400`.
+ # Provides transparent access to the values in {#headers}.
#
- # @return [Boolean]
- # Specifies whether the response code is `400`.
+ # @param [Symbol] name
+ # The name of the missing method.
#
- def bad_request?
- code == 400
- end
-
+ # @param [Array] arguments
+ # Additional arguments for the missing method.
#
- # Determines if the response code is `401`.
- #
- # @return [Boolean]
- # Specifies whether the response code is `401`.
- #
- def is_unauthorized?
- code == 401
- end
-
- alias unauthorized? is_unauthorized?
-
- #
- # Determines if the response code is `403`.
- #
- # @return [Boolean]
- # Specifies whether the response code is `403`.
- #
- def is_forbidden?
- code == 403
- end
-
- alias forbidden? is_forbidden?
-
- #
- # Determines if the response code is `404`.
- #
- # @return [Boolean]
- # Specifies whether the response code is `404`.
- #
- def is_missing?
- code == 404
- end
-
- alias missing? is_missing?
-
- #
- # Determines if the response code is `500`.
- #
- # @return [Boolean]
- # Specifies whether the response code is `500`.
- #
- def had_internal_server_error?
- code == 500
- end
-
- #
- # The Content-Type of the page.
- #
# @return [String]
- # The Content-Type of the page.
+ # The missing method mapped to a header in {#headers}.
#
- def content_type
- (@response['Content-Type'] || '')
- end
-
+ # @raise [NoMethodError]
+ # The missing method did not map to a header in {#headers}.
#
- # The content types of the page.
- #
- # @return [Array<String>]
- # The values within the Content-Type header.
- #
- # @since 0.2.2
- #
- def content_types
- (@headers['content-type'] || [])
- end
+ def method_missing(name,*arguments,&block)
+ if (arguments.empty? && block.nil?)
+ header_name = name.to_s.sub('_','-')
- #
- # Determines if the page is plain-text.
- #
- # @return [Boolean]
- # Specifies whether the page is plain-text.
- #
- def plain_text?
- is_content_type?('text/plain')
- end
-
- alias txt? plain_text?
-
- #
- # Determines if the page is HTML document.
- #
- # @return [Boolean]
- # Specifies whether the page is HTML document.
- #
- def html?
- is_content_type?('text/html')
- end
-
- #
- # Determines if the page is XML document.
- #
- # @return [Boolean]
- # Specifies whether the page is XML document.
- #
- def xml?
- is_content_type?('text/xml')
- end
-
- #
- # Determines if the page is XML Stylesheet (XSL).
- #
- # @return [Boolean]
- # Specifies whether the page is XML Stylesheet (XSL).
- #
- def xsl?
- is_content_type?('text/xsl')
- end
-
- #
- # Determines if the page is JavaScript.
- #
- # @return [Boolean]
- # Specifies whether the page is JavaScript.
- #
- def javascript?
- is_content_type?('text/javascript') || \
- is_content_type?('application/javascript')
- end
-
- #
- # Determines if the page is a CSS stylesheet.
- #
- # @return [Boolean]
- # Specifies whether the page is a CSS stylesheet.
- #
- def css?
- is_content_type?('text/css')
- end
-
- #
- # Determines if the page is a RSS feed.
- #
- # @return [Boolean]
- # Specifies whether the page is a RSS feed.
- #
- def rss?
- is_content_type?('application/rss+xml') || \
- is_content_type?('application/rdf+xml')
- end
-
- #
- # Determines if the page is an Atom feed.
- #
- # @return [Boolean]
- # Specifies whether the page is an Atom feed.
- #
- def atom?
- is_content_type?('application/atom+xml')
- end
-
- #
- # Determines if the page is a MS Word document.
- #
- # @return [Boolean]
- # Specifies whether the page is a MS Word document.
- #
- def ms_word?
- is_content_type?('application/msword')
- end
-
- #
- # Determines if the page is a PDF document.
- #
- # @return [Boolean]
- # Specifies whether the page is a PDF document.
- #
- def pdf?
- is_content_type?('application/pdf')
- end
-
- #
- # Determines if the page is a ZIP archive.
- #
- # @return [Boolean]
- # Specifies whether the page is a ZIP archive.
- #
- def zip?
- is_content_type?('application/zip')
- end
-
- #
- # The raw Cookie String sent along with the page.
- #
- # @return [String]
- # The raw Cookie from the response.
- #
- # @since 0.2.7
- #
- def raw_cookie
- (@response['Set-Cookie'] || '')
- end
-
- #
- # The raw Cookie String sent along with the page.
- #
- # @return [String]
- # The raw Cookie from the response.
- #
- # @deprecated
- # Deprecated in 0.2.7 and will be removed in 0.3.0.
- # Use {#raw_cookie} instead.
- #
- # @since 0.2.2
- #
- def cookie
- STDERR.puts 'DEPRECATION: Spidr::Page#cookie will be removed in 0.3.0'
- STDERR.puts 'DEPRECATION: Use Spidr::Page#raw_cookie instead'
-
- return raw_cookie
- end
-
- #
- # The Cookie values sent along with the page.
- #
- # @return [Array<String>]
- # The Cookies from the response.
- #
- # @since 0.2.2
- #
- def cookies
- (@headers['set-cookie'] || [])
- end
-
- #
- # The Cookie key -> value pairs returned with the response.
- #
- # @return [Hash{String => String}]
- # The cookie keys and values.
- #
- # @since 0.2.2
- #
- def cookie_params
- params = {}
-
- cookies.each do |cookie|
- cookie.split('; ').each do |key_value|
- key, value = key_value.split('=',2)
-
- next if RESERVED_COOKIE_NAMES.include?(key)
-
- params[key] = (value || '')
+ if @response.key?(header_name)
+ return @response[header_name]
end
end
- return params
- end
-
- #
- # The body of the response.
- #
- # @return [String]
- # The body of the response.
- #
- def body
- (@response.body || '')
- end
-
- #
- # Returns a parsed document object for HTML, XML, RSS and Atom pages.
- #
- # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
- # The document that represents HTML or XML pages.
- # Returns `nil` if the page is neither HTML, XML, RSS, Atom or if
- # the page could not be parsed properly.
- #
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
- #
- def doc
- return nil if body.empty?
-
- begin
- if html?
- return @doc ||= Nokogiri::HTML(body)
- elsif (xml? || xsl? || rss? || atom?)
- return @doc ||= Nokogiri::XML(body)
- end
- rescue
- return nil
- end
- end
-
- #
- # Searches the document for XPath or CSS Path paths.
- #
- # @param [Array<String>] paths
- # CSS or XPath expressions to search the document with.
- #
- # @return [Array]
- # The matched nodes from the document.
- # Returns an empty Array if no nodes were matched, or if the page
- # is not an HTML or XML document.
- #
- # @example
- # page.search('//a[@href]')
- #
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
- #
- def search(*paths)
- if doc
- doc.search(*paths)
- else
- []
- end
- end
-
- #
- # Searches for the first occurrence an XPath or CSS Path expression.
- #
- # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
- # The first matched node. Returns `nil` if no nodes could be matched,
- # or if the page is not a HTML or XML document.
- #
- # @example
- # page.at('//title')
- #
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
- #
- def at(*arguments)
- if doc
- doc.at(*arguments)
- end
- end
-
- alias / search
- alias % at
-
- #
- # The title of the HTML page.
- #
- # @return [String]
- # The inner-text of the title element of the page.
- #
- def title
- if (node = at('//title'))
- node.inner_text
- end
- end
-
- #
- # The links from within the page.
- #
- # @return [Array<String>]
- # All links within the HTML page, frame/iframe source URLs and any
- # links in the `Location` header.
- #
- def links
- urls = []
-
- add_url = lambda { |url|
- urls << url unless (url.nil? || url.empty?)
- }
-
- self.redirects_to.each(&add_url) if self.is_redirect?
-
- if (html? && doc)
- doc.search('a[@href]').each do |a|
- add_url.call(a.get_attribute('href'))
- end
-
- doc.search('frame[@src]').each do |iframe|
- add_url.call(iframe.get_attribute('src'))
- end
-
- doc.search('iframe[@src]').each do |iframe|
- add_url.call(iframe.get_attribute('src'))
- end
-
- doc.search('link[@href]').each do |link|
- add_url.call(link.get_attribute('href'))
- end
-
- doc.search('script[@src]').each do |script|
- add_url.call(script.get_attribute('src'))
- end
- end
-
- return urls
- end
-
- #
- # URL(s) that this document redirects to.
- #
- # @return [Array<String>]
- # The links that this page redirects to (usually found in a
- # location header or by way of a page-level meta redirect).
- #
- def redirects_to
- location = @headers['location']
-
- if location.nil?
- # check page-level meta redirects if there isn't a location header
- meta_redirect
- elsif location.kind_of?(Array)
- location
- else
- # usually the location header contains a single String
- [location]
- end
- end
-
- #
- # Absolute URIs from within the page.
- #
- # @return [Array<URI::HTTP>]
- # The links from within the page, converted to absolute URIs.
- #
- def urls
- links.map { |link| to_absolute(link) }.compact
- end
-
- #
- # Normalizes and expands a given link into a proper URI.
- #
- # @param [String] link
- # The link to normalize and expand.
- #
- # @return [URI::HTTP]
- # The normalized URI.
- #
- def to_absolute(link)
- begin
- url = @url.merge(link.to_s)
- rescue URI::InvalidURIError, URI::InvalidComponentError
- return nil
- end
-
- unless (url.path.nil? || url.path.empty?)
- # make sure the path does not contain any .. or . directories,
- # since URI::Generic#merge cannot normalize paths such as
- # "/stuff/../"
- url.path = URI.expand_path(url.path)
- end
-
- return url
- end
-
- #
- # Determines if a page-level "soft" redirect is present. If yes,
- # returns an array of those redirects (usually a single URL).
- # Otherwise, returns false.
- #
- # @return [Array<String>]
- # An array of redirect URLs
- #
- def meta_redirect
- redirects = []
-
- if (html? && doc)
- search('//meta[@http-equiv and @content]').each do |node|
- if node.get_attribute('http-equiv') =~ /refresh/i
- content = node.get_attribute('content')
-
- if (redirect = content.match(/url=(\S+)$/))
- redirects << redirect[1]
- end
- end
- end
- end
-
- return redirects.uniq
- end
-
- #
- # Returns a boolean indicating whether or not page-level meta
- # redirects are present in this page.
- #
- # @return [Boolean]
- # Specifies whether the page includes page-level redirects.
- #
- def meta_redirect?
- !meta_redirect.empty?
- end
-
- protected
-
- #
- # Determines if any of the content-types of the page include a given
- # type.
- #
- # @param [String] type
- # The content-type to test for.
- #
- # @return [Boolean]
- # Specifies whether the page includes the given content-type.
- #
- # @since 0.2.4
- #
- def is_content_type?(type)
- content_types.any? { |content| content.include?(type) }
- end
-
- #
- # Provides transparent access to the values in `headers`.
- #
- def method_missing(sym,*args,&block)
- if (args.empty? && block.nil?)
- name = sym.id2name.sub('_','-')
-
- return @response[name] if @response.key?(name)
- end
-
- return super(sym,*args,&block)
+ return super(name,*arguments,&block)
end
end
end