lib/spidr/page.rb in spidr-0.1.9 vs lib/spidr/page.rb in spidr-0.2.0
- old
+ new
@@ -1,5 +1,7 @@
+require 'spidr/extensions/uri'
+
require 'uri'
require 'nokogiri'
module Spidr
class Page
@@ -8,219 +10,357 @@
attr_reader :url
# HTTP Response
attr_reader :response
- # Body returned for the page
- attr_reader :body
-
# Headers returned with the body
attr_reader :headers
#
- # Creates a new Page object from the specified _url_ and HTTP
- # _response_.
+ # Creates a new Page object.
#
+ # @param [URI::HTTP] url
+ # The URL of the page.
+ #
+ # @param [Net::HTTP::Response] response
+ # The response from the request for the page.
+ #
def initialize(url,response)
  # remember where the page came from and what the server answered
  @url, @response = url, response
  # keep the response headers around in Hash form
  @headers = @response.to_hash
  # no document has been parsed yet
  @doc = nil
end
#
- # Returns the response code from the page.
+ # The response code from the page.
#
+ # @return [Integer]
+ # Response code from the page.
+ #
def code
- @response.code
+ @response.code.to_i
end
#
- # Returns +true+ if the response code is 200, returns +false+ otherwise.
+ # Determines if the response code is +200+.
#
+ # @return [Boolean]
+ # Specifies whether the response code is +200+.
+ #
def is_ok?
  # 200 is the HTTP "OK" status
  status = code
  status == 200
end
+ alias ok? is_ok?
+
#
- # Returns +true+ if the response code is 301 or 307, returns +false+
- # otherwise.
+ # Determines if the response code is +301+ or +307+.
#
+ # @return [Boolean]
+ # Specifies whether the response code is +301+ or +307+.
+ #
def is_redirect?
  # Recognise the same statuses for which #links follows the Location
  # header (300 through 303, and 307), so that a 302 or 303 response is
  # reported as a redirect as well. 301 and 307 still return true.
  case code
  when 300..303, 307 then true
  else false
  end
end
+ alias redirect? is_redirect?
+
#
- # Returns +true+ if the response code is 308, returns +false+ otherwise.
+ # Determines if the response code is +308+.
#
+ # @return [Boolean]
+ # Specifies whether the response code is +308+.
+ #
def timedout?
  # 408 is the HTTP "Request Timeout" status. 308 is what this method
  # checked previously and is still accepted, so existing callers keep
  # getting true for it.
  status = code
  status == 408 || status == 308
end
#
- # Returns +true+ if the response code is 400, returns +false+ otherwise.
+ # Determines if the response code is +400+.
#
+ # @return [Boolean]
+ # Specifies whether the response code is +400+.
+ #
def bad_request?
  # 400 is the HTTP "Bad Request" status
  status = code
  status == 400
end
#
- # Returns +true+ if the response code is 401, returns +false+ otherwise.
+ # Determines if the response code is +401+.
#
+ # @return [Boolean]
+ # Specifies whether the response code is +401+.
+ #
def is_unauthorized?
  # 401 is the HTTP "Unauthorized" status
  status = code
  status == 401
end
+ alias unauthorized? is_unauthorized?
+
#
- # Returns +true+ if the response code is 403, returns +false+ otherwise.
+ # Determines if the response code is +403+.
#
+ # @return [Boolean]
+ # Specifies whether the response code is +403+.
+ #
def is_forbidden?
  # 403 is the HTTP "Forbidden" status
  status = code
  status == 403
end
+ alias forbidden? is_forbidden?
+
#
- # Returns +true+ if the response code is 404, returns +false+ otherwise.
+ # Determines if the response code is +404+.
#
+ # @return [Boolean]
+ # Specifies whether the response code is +404+.
+ #
def is_missing?
  # 404 is the HTTP "Not Found" status
  status = code
  status == 404
end
+ alias missing? is_missing?
+
#
- # Returns +true+ if the response code is 500, returns +false+ otherwise.
+ # Determines if the response code is +500+.
#
+ # @return [Boolean]
+ # Specifies whether the response code is +500+.
+ #
def had_internal_server_error?
  # 500 is the HTTP "Internal Server Error" status
  status = code
  status == 500
end
#
- # Returns the content-type of the page.
+ # The Content-Type of the page.
#
+ # @return [String]
+ # The Content-Type of the page.
+ #
def content_type
  # look the header up on the response object itself
  header_name = 'Content-Type'
  response[header_name]
end
#
- # Returns +true+ if the page is a plain text document, returns +false+
- # otherwise.
+ # Determines if the page is plain-text.
#
+ # @return [Boolean]
+ # Specifies whether the page is plain-text.
+ #
def plain_text?
  # the media type has to be the very first thing in the header value;
  # a missing Content-Type (nil) never matches
  content_type.to_s.start_with?('text/plain')
end
+ alias txt? plain_text?
+
#
- # Returns +true+ if the page is a HTML document, returns +false+
- # otherwise.
+ # Determines if the page is an HTML document.
#
+ # @return [Boolean]
+ # Specifies whether the page is an HTML document.
+ #
def html?
  # the media type has to be the very first thing in the header value;
  # a missing Content-Type (nil) never matches
  content_type.to_s.start_with?('text/html')
end
#
- # Returns +true+ if the page is a XML document, returns +false+
- # otherwise.
+ # Determines if the page is an XML document.
#
+ # @return [Boolean]
+ # Specifies whether the page is an XML document.
+ #
def xml?
  # Accept both registered generic XML media types: text/xml and
  # application/xml. The type has to be the very first thing in the
  # header value; a missing Content-Type (nil) never matches.
  content_type.to_s.start_with?('text/xml', 'application/xml')
end
#
- # Returns +true+ if the page is a Javascript file, returns +false+
- # otherwise.
+ # Determines if the page is JavaScript.
#
+ # @return [Boolean]
+ # Specifies whether the page is JavaScript.
+ #
def javascript?
  # either the text/ or the application/ flavour counts, as long as it
  # is the very first thing in the header value
  content_type.to_s.start_with?('text/javascript', 'application/javascript')
end
#
- # Returns +true+ if the page is a CSS file, returns +false+
- # otherwise.
+ # Determines if the page is a CSS stylesheet.
#
+ # @return [Boolean]
+ # Specifies whether the page is a CSS stylesheet.
+ #
def css?
  # the media type has to be the very first thing in the header value;
  # a missing Content-Type (nil) never matches
  content_type.to_s.start_with?('text/css')
end
#
- # Returns +true+ if the page is a RSS/RDF feed, returns +false+
- # otherwise.
+ # Determines if the page is a RSS feed.
#
+ # @return [Boolean]
+ # Specifies whether the page is a RSS feed.
+ #
def rss?
  # both the RSS and the RDF media type are treated as a feed
  content_type.to_s.start_with?('application/rss+xml', 'application/rdf+xml')
end
#
- # Returns +true+ if the page is a Atom feed, returns +false+
- # otherwise.
+ # Determines if the page is an Atom feed.
#
+ # @return [Boolean]
+ # Specifies whether the page is an Atom feed.
+ #
def atom?
  # the media type has to be the very first thing in the header value;
  # a missing Content-Type (nil) never matches
  content_type.to_s.start_with?('application/atom+xml')
end
#
- # Returns +true+ if the page is a MS Word document, returns +false+
- # otherwise.
+ # Determines if the page is a MS Word document.
#
+ # @return [Boolean]
+ # Specifies whether the page is a MS Word document.
+ #
def ms_word?
  # the media type has to be the very first thing in the header value;
  # a missing Content-Type (nil) never matches
  content_type.to_s.start_with?('application/msword')
end
#
- # Returns +true+ if the page is a PDF document, returns +false+
- # otherwise.
+ # Determines if the page is a PDF document.
#
+ # @return [Boolean]
+ # Specifies whether the page is a PDF document.
+ #
def pdf?
  # the media type has to be the very first thing in the header value;
  # a missing Content-Type (nil) never matches
  content_type.to_s.start_with?('application/pdf')
end
#
- # Returns +true+ if the page is a ZIP archive, returns +false+
- # otherwise.
+ # Determines if the page is a ZIP archive.
#
+ # @return [Boolean]
+ # Specifies whether the page is a ZIP archive.
+ #
def zip?
  # the media type has to be the very first thing in the header value;
  # a missing Content-Type (nil) never matches
  content_type.to_s.start_with?('application/zip')
end
#
- # Returns the body of the page in +String+ form.
+ # The body of the response.
#
+ # @return [String]
+ # The body of the response.
+ #
def body
  # the body is not copied or cached here; it is read from the
  # response object on every call
  page_response = response
  page_response.body
end
#
- # If the page has a <tt>text/html</tt> content-type, a
- # Nokogiri::HTML::Document object will be returned. If the page has a
- # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
- # will be returned. Other content-types will cause +nil+ to be
- # returned.
+ # Returns a parsed document object for HTML, XML, RSS and Atom pages.
#
+ # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
+ # The document that represents HTML or XML pages.
+ # Returns +nil+ if the page is not HTML, XML, RSS or Atom, or if
+ # the page could not be parsed properly.
+ #
def doc
return nil if (body.nil? || body.empty?)
begin
if html?
return @doc ||= Nokogiri::HTML(body)
- elsif xml?
+ elsif (xml? || rss? || atom?)
return @doc ||= Nokogiri::XML(body)
end
rescue
return nil
end
end
#
- # Returns all links from the HTML page.
+ # Searches the document for XPath or CSS Path paths.
#
+ # @param [Array<String>] paths
+ # CSS or XPath expressions to search the document with.
+ #
+ # @return [Array]
+ # The matched nodes from the document.
+ # Returns an empty Array if no nodes were matched, or if the page
+ # is not an HTML or XML document.
+ #
+ # @example
+ # page.search('//a[@href]')
+ #
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
+ #
+ def search(*paths)
+ if doc
+ return doc.search(*paths)
+ end
+
+ return []
+ end
+
+ #
+ # Searches for the first occurrence of an XPath or CSS Path expression.
+ #
+ # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
+ # The first matched node. Returns +nil+ if no nodes could be matched,
+ # or if the page is not a HTML or XML document.
+ #
+ # @example
+ # page.at('//title')
+ #
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
+ #
+ def at(*arguments)
+ if doc
+ return doc.at(*arguments)
+ end
+
+ return nil
+ end
+
+ alias / search
+ alias % at
+
+ #
+ # The title of the HTML page.
+ #
+ # @return [String]
+ # The inner-text of the title element of the page.
+ #
+ def title
+ if (node = at('//title'))
+ return node.inner_text
+ end
+ end
+
+ #
+ # The links from within the page.
+ #
+ # @return [Array<String>]
+ # All links within the HTML page, frame/iframe source URLs and any
+ # links in the +Location+ header.
+ #
def links
urls = []
add_url = lambda { |url|
urls << url unless (url.nil? || url.empty?)
}
case code
when 300..303, 307
- add_url.call(@headers['location'])
+ location = @headers['location']
+
+ if location.kind_of?(Array)
+ # handle multiple location URLs
+ location.each(&add_url)
+ else
+ # usually the location header contains a single String
+ add_url.call(location)
+ end
end
if (html? && doc)
doc.search('a[@href]').each do |a|
add_url.call(a.get_attribute('href'))
@@ -237,47 +377,48 @@
return urls
end
#
- # Returns all links from the HtML page as absolute URLs.
+ # Absolute URIs from within the page.
#
+ # @return [Array<URI::HTTP>]
+ # The links from within the page, converted to absolute URIs.
+ #
def urls
  absolute = []

  links.each do |link|
    uri = to_absolute(link)

    # links that could not be converted come back as nil and are dropped
    absolute << uri unless uri.nil?
  end

  absolute
end
- protected
-
#
- # Converts the specified _link_ into an absolute URL
- # based on the url of the page.
+ # Normalizes and expands a given link into a proper URI.
#
+ # @param [String] link
+ # The link to normalize and expand.
+ #
+ # @return [URI::HTTP]
+ # The normalized URI.
+ #
def to_absolute(link)
- # decode, clean then re-encode the URL
- link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,''))
-
begin
- relative = URI(link)
- absolute = @url.merge(relative)
-
- if absolute.path
- if absolute.path.empty?
- # default the absolute path to '/'
- absolute.path = '/'
- else
- # make sure the path does not contain any .. or . directories.
- absolute.path = File.expand_path(absolute.path)
- end
- end
-
- return absolute
- rescue URI::InvalidURIError => e
+ url = @url.merge(link.to_s)
+ rescue URI::InvalidURIError
return nil
end
+
+ unless (url.path.nil? || url.path.empty?)
+ # make sure the path does not contain any .. or . directories,
+ # since URI::Generic#merge cannot normalize paths such as
+ # "/stuff/../"
+ url.path = URI.expand_path(url.path)
+ end
+
+ return url
end
+ protected
+
#
- # Provides transparent access to the values in the +headers+ +Hash+.
+ # Provides transparent access to the values in +headers+.
#
def method_missing(sym,*args,&block)
if (args.empty? && block.nil?)
name = sym.id2name.sub('_','-')