page.rb in spidr-0.2.0

- old
+ new

@@ -1,5 +1,7 @@
+require 'spidr/extensions/uri'
+
 require 'uri'
 require 'nokogiri'
 
 module Spidr
   class Page
@@ -8,219 +10,357 @@
     attr_reader :url
 
     # HTTP Response
     attr_reader :response
 
-    # Body returned for the page
-    attr_reader :body
-
     # Headers returned with the body
     attr_reader :headers
 
     #
-    # Creates a new Page object from the specified _url_ and HTTP
-    # _response_.
+    # Creates a new Page object.
     #
+    # @param [URI::HTTP] url
+    #   The URL of the page.
+    #
+    # @param [Net::HTTPResponse] response
+    #   The response from the request for the page.
+    #
     def initialize(url,response)
       @url = url
       @response = response
       @headers = response.to_hash
       @doc = nil
     end
 
     #
-    # Returns the response code from the page.
+    # The response code from the page.
     #
+    # @return [Integer]
+    #   Response code from the page.
+    #
     def code
-      @response.code
+      @response.code.to_i
     end
 
     #
-    # Returns +true+ if the response code is 200, returns +false+ otherwise.
+    # Determines if the response code is +200+.
     #
+    # @return [Boolean]
+    #   Specifies whether the response code is +200+.
+    #
     def is_ok?
       code == 200
     end
 
+    alias ok? is_ok?
+
     #
-    # Returns +true+ if the response code is 301 or 307, returns +false+
-    # otherwise.
+    # Determines if the response code is +301+ or +307+.
     #
+    # @return [Boolean]
+    #   Specifies whether the response code is +301+ or +307+.
+    #
     def is_redirect?
       (code == 301 || code == 307)
     end
 
+    alias redirect? is_redirect?
+
     #
-    # Returns +true+ if the response code is 308, returns +false+ otherwise.
+    # Determines if the response code is +308+.
     #
+    # @return [Boolean]
+    #   Specifies whether the response code is +308+.
+    #
     def timedout?
       code == 308
     end
 
     #
-    # Returns +true+ if the response code is 400, returns +false+ otherwise.
+    # Determines if the response code is +400+.
     #
+    # @return [Boolean]
+    #   Specifies whether the response code is +400+.
+    #
     def bad_request?
       code == 400
     end
 
     #
-    # Returns +true+ if the response code is 401, returns +false+ otherwise.
+    # Determines if the response code is +401+.
     #
+    # @return [Boolean]
+    #   Specifies whether the response code is +401+.
+    #
     def is_unauthorized?
       code == 401
     end
 
+    alias unauthorized? is_unauthorized?
+
     #
-    # Returns +true+ if the response code is 403, returns +false+ otherwise.
+    # Determines if the response code is +403+.
     #
+    # @return [Boolean]
+    #   Specifies whether the response code is +403+.
+    #
     def is_forbidden?
       code == 403
     end
 
+    alias forbidden? is_forbidden?
+
     #
-    # Returns +true+ if the response code is 404, returns +false+ otherwise.
+    # Determines if the response code is +404+.
     #
+    # @return [Boolean]
+    #   Specifies whether the response code is +404+.
+    #
     def is_missing?
       code == 404
     end
 
+    alias missing? is_missing?
+
     #
-    # Returns +true+ if the response code is 500, returns +false+ otherwise.
+    # Determines if the response code is +500+.
     #
+    # @return [Boolean]
+    #   Specifies whether the response code is +500+.
+    #
     def had_internal_server_error?
       code == 500
     end
 
     #
-    # Returns the content-type of the page.
+    # The Content-Type of the page.
     #
+    # @return [String]
+    #   The Content-Type of the page.
+    #
     def content_type
       @response['Content-Type']
     end
 
     #
-    # Returns +true+ if the page is a plain text document, returns +false+
-    # otherwise.
+    # Determines if the page is plain-text.
     #
+    # @return [Boolean]
+    #   Specifies whether the page is plain-text.
+    #
     def plain_text?
       (content_type =~ /text\/plain/) == 0
     end
 
+    alias txt? plain_text?
+
     #
-    # Returns +true+ if the page is a HTML document, returns +false+
-    # otherwise.
+    # Determines if the page is an HTML document.
     #
+    # @return [Boolean]
+    #   Specifies whether the page is an HTML document.
+    #
     def html?
       (content_type =~ /text\/html/) == 0
     end
 
     #
-    # Returns +true+ if the page is a XML document, returns +false+
-    # otherwise.
+    # Determines if the page is an XML document.
     #
+    # @return [Boolean]
+    #   Specifies whether the page is an XML document.
+    #
     def xml?
       (content_type =~ /text\/xml/) == 0
     end
 
     #
-    # Returns +true+ if the page is a Javascript file, returns +false+
-    # otherwise.
+    # Determines if the page is JavaScript.
     #
+    # @return [Boolean]
+    #   Specifies whether the page is JavaScript.
+    #
     def javascript?
       (content_type =~ /(text|application)\/javascript/) == 0
     end
 
     #
-    # Returns +true+ if the page is a CSS file, returns +false+
-    # otherwise.
+    # Determines if the page is a CSS stylesheet.
     #
+    # @return [Boolean]
+    #   Specifies whether the page is a CSS stylesheet.
+    #
     def css?
       (content_type =~ /text\/css/) == 0
     end
 
     #
-    # Returns +true+ if the page is a RSS/RDF feed, returns +false+
-    # otherwise.
+    # Determines if the page is an RSS or RDF feed.
     #
+    # @return [Boolean]
+    #   Specifies whether the page is an RSS or RDF feed.
+    #
     def rss?
       (content_type =~ /application\/(rss|rdf)\+xml/) == 0
     end
 
     #
-    # Returns +true+ if the page is a Atom feed, returns +false+
-    # otherwise.
+    # Determines if the page is an Atom feed.
     #
+    # @return [Boolean]
+    #   Specifies whether the page is an Atom feed.
+    #
     def atom?
       (content_type =~ /application\/atom\+xml/) == 0
     end
 
     #
-    # Returns +true+ if the page is a MS Word document, returns +false+
-    # otherwise.
+    # Determines if the page is a MS Word document.
     #
+    # @return [Boolean]
+    #   Specifies whether the page is a MS Word document.
+    #
     def ms_word?
       (content_type =~ /application\/msword/) == 0
     end
 
     #
-    # Returns +true+ if the page is a PDF document, returns +false+
-    # otherwise.
+    # Determines if the page is a PDF document.
     #
+    # @return [Boolean]
+    #   Specifies whether the page is a PDF document.
+    #
     def pdf?
       (content_type =~ /application\/pdf/) == 0
     end
 
     #
-    # Returns +true+ if the page is a ZIP archive, returns +false+
-    # otherwise.
+    # Determines if the page is a ZIP archive.
     #
+    # @return [Boolean]
+    #   Specifies whether the page is a ZIP archive.
+    #
     def zip?
       (content_type =~ /application\/zip/) == 0
     end
 
     #
-    # Returns the body of the page in +String+ form.
+    # The body of the response.
     #
+    # @return [String]
+    #   The body of the response.
+    #
     def body
       @response.body
     end
 
     #
-    # If the page has a <tt>text/html</tt> content-type, a
-    # Nokogiri::HTML::Document object will be returned. If the page has a
-    # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
-    # will be returned. Other content-types will cause +nil+ to be
-    # returned.
+    # Returns a parsed document object for HTML, XML, RSS and Atom pages.
     #
+    # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
+    #   The document that represents HTML or XML pages.
+    #   Returns +nil+ if the body is empty, if the page is not HTML,
+    #   XML, RSS or Atom, or if the page could not be parsed properly.
+    #
     def doc
       return nil if (body.nil? || body.empty?)
 
       begin
         if html?
           return @doc ||= Nokogiri::HTML(body)
-        elsif xml?
+        elsif (xml? || rss? || atom?)
           return @doc ||= Nokogiri::XML(body)
         end
       rescue
         return nil
       end
     end
 
     #
-    # Returns all links from the HTML page.
+    # Searches the document using XPath or CSS Path expressions.
     #
+    # @param [Array<String>] paths
+    #   CSS or XPath expressions to search the document with.
+    #
+    # @return [Array]
+    #   The matched nodes from the document.
+    #   Returns an empty Array if no nodes were matched, or if the page
+    #   is not an HTML or XML document.
+    #
+    # @example
+    #   page.search('//a[@href]')
+    #
+    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
+    #
+    def search(*paths)
+      if doc
+        return doc.search(*paths)
+      end
+
+      return []
+    end
+
+    #
+    # Searches for the first occurrence of an XPath or CSS Path expression.
+    #
+    # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
+    #   The first matched node. Returns +nil+ if no nodes could be matched,
+    #   or if the page is not an HTML or XML document.
+    #
+    # @example
+    #   page.at('//title')
+    #
+    # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
+    #
+    def at(*arguments)
+      if doc
+        return doc.at(*arguments)
+      end
+
+      return nil
+    end
+
+    alias / search
+    alias % at
+
+    #
+    # The title of the HTML page.
+    #
+    # @return [String, nil]
+    #   The inner-text of the title element, or +nil+ if none was found.
+    #
+    def title
+      if (node = at('//title'))
+        return node.inner_text
+      end
+    end
+
+    #
+    # The links from within the page.
+    #
+    # @return [Array<String>]
+    #   All links within the HTML page, frame/iframe source URLs and any
+    #   links in the +Location+ header.
+    #
     def links
       urls = []
 
       add_url = lambda { |url|
         urls << url unless (url.nil? || url.empty?)
       }
 
       case code
       when 300..303, 307
-        add_url.call(@headers['location'])
+        location = @headers['location']
+
+        if location.kind_of?(Array)
+          # handle multiple location URLs
+          location.each(&add_url)
+        else
+          # usually the location header contains a single String
+          add_url.call(location)
+        end
       end
 
       if (html? && doc)
         doc.search('a[@href]').each do |a|
           add_url.call(a.get_attribute('href'))
@@ -237,47 +377,48 @@
 
       return urls
     end
 
     #
-    # Returns all links from the HtML page as absolute URLs.
+    # Absolute URIs from within the page.
     #
+    # @return [Array<URI::HTTP>]
+    #   The links from within the page, converted to absolute URIs.
+    #
     def urls
       links.map { |link| to_absolute(link) }.compact
     end
 
-    protected
-
     #
-    # Converts the specified _link_ into an absolute URL
-    # based on the url of the page.
+    # Normalizes and expands a given link into a proper URI.
     #
+    # @param [String] link
+    #   The link to normalize and expand.
+    #
+    # @return [URI::HTTP, nil]
+    #   The normalized URI, or +nil+ if the link is not a valid URI.
+    #
     def to_absolute(link)
-      # decode, clean then re-encode the URL
-      link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
-
       begin
-        relative = URI(link)
-        absolute = @url.merge(relative)
-
-        if absolute.path
-          if absolute.path.empty?
-            # default the absolute path to '/'
-            absolute.path = '/'
-          else
-            # make sure the path does not contain any .. or . directories.
-            absolute.path = File.expand_path(absolute.path)
-          end
-        end
-
-        return absolute
-      rescue URI::InvalidURIError => e
+        url = @url.merge(link.to_s)
+      rescue URI::InvalidURIError
         return nil
       end
+
+      unless (url.path.nil? || url.path.empty?)
+        # make sure the path does not contain any .. or . directories,
+        # since URI::Generic#merge cannot normalize paths such as
+        # "/stuff/../"
+        url.path = URI.expand_path(url.path)
+      end
+
+      return url
     end
 
+    protected
+
     #
-    # Provides transparent access to the values in the +headers+ +Hash+.
+    # Provides transparent access to the values in +headers+.
     #
     def method_missing(sym,*args,&block)
       if (args.empty? && block.nil?)
         name = sym.id2name.sub('_','-')