lib/spidr/page.rb in spidr-0.1.9 vs lib/spidr/page.rb in spidr-0.2.0

- old
+ new

@@ -1,5 +1,7 @@ +require 'spidr/extensions/uri' + require 'uri' require 'nokogiri' module Spidr class Page @@ -8,219 +10,357 @@ attr_reader :url # HTTP Response attr_reader :response - # Body returned for the page - attr_reader :body - # Headers returned with the body attr_reader :headers # - # Creates a new Page object from the specified _url_ and HTTP - # _response_. + # Creates a new Page object. # + # @param [URI::HTTP] url + # The URL of the page. + # + # @param [Net::HTTP::Response] response + # The response from the request for the page. + # def initialize(url,response) @url = url @response = response @headers = response.to_hash @doc = nil end # - # Returns the response code from the page. + # The response code from the page. # + # @return [Integer] + # Response code from the page. + # def code - @response.code + @response.code.to_i end # - # Returns +true+ if the response code is 200, returns +false+ otherwise. + # Determines if the response code is +200+. # + # @return [Boolean] + # Specifies whether the response code is +200+. + # def is_ok? code == 200 end + alias ok? is_ok? + # - # Returns +true+ if the response code is 301 or 307, returns +false+ - # otherwise. + # Determines if the response code is +301+ or +307+. # + # @return [Boolean] + # Specifies whether the response code is +301+ or +307+. + # def is_redirect? (code == 301 || code == 307) end + alias redirect? is_redirect? + # - # Returns +true+ if the response code is 308, returns +false+ otherwise. + # Determines if the response code is +308+. # + # @return [Boolean] + # Specifies whether the response code is +308+. + # def timedout? code == 308 end # - # Returns +true+ if the response code is 400, returns +false+ otherwise. + # Determines if the response code is +400+. # + # @return [Boolean] + # Specifies whether the response code is +400+. + # def bad_request? code == 400 end # - # Returns +true+ if the response code is 401, returns +false+ otherwise. + # Determines if the response code is +401+. # + # @return [Boolean] + # Specifies whether the response code is +401+. + # def is_unauthorized? code == 401 end + alias unauthorized? is_unauthorized? + # - # Returns +true+ if the response code is 403, returns +false+ otherwise. + # Determines if the response code is +403+. # + # @return [Boolean] + # Specifies whether the response code is +403+. + # def is_forbidden? code == 403 end + alias forbidden? is_forbidden? + # - # Returns +true+ if the response code is 404, returns +false+ otherwise. + # Determines if the response code is +404+. # + # @return [Boolean] + # Specifies whether the response code is +404+. + # def is_missing? code == 404 end + alias missing? is_missing? + # - # Returns +true+ if the response code is 500, returns +false+ otherwise. + # Determines if the response code is +500+. # + # @return [Boolean] + # Specifies whether the response code is +500+. + # def had_internal_server_error? code == 500 end # - # Returns the content-type of the page. + # The Content-Type of the page. # + # @return [String] + # The Content-Type of the page. + # def content_type @response['Content-Type'] end # - # Returns +true+ if the page is a plain text document, returns +false+ - # otherwise. + # Determines if the page is plain-text. # + # @return [Boolean] + # Specifies whether the page is plain-text. + # def plain_text? (content_type =~ /text\/plain/) == 0 end + alias txt? plain_text? + # - # Returns +true+ if the page is a HTML document, returns +false+ - # otherwise. + # Determines if the page is HTML document. # + # @return [Boolean] + # Specifies whether the page is HTML document. + # def html? (content_type =~ /text\/html/) == 0 end # - # Returns +true+ if the page is a XML document, returns +false+ - # otherwise. + # Determines if the page is XML document. # + # @return [Boolean] + # Specifies whether the page is XML document. + # def xml? (content_type =~ /text\/xml/) == 0 end # - # Returns +true+ if the page is a Javascript file, returns +false+ - # otherwise. + # Determines if the page is JavaScript. # + # @return [Boolean] + # Specifies whether the page is JavaScript. + # def javascript? (content_type =~ /(text|application)\/javascript/) == 0 end # - # Returns +true+ if the page is a CSS file, returns +false+ - # otherwise. + # Determines if the page is a CSS stylesheet. # + # @return [Boolean] + # Specifies whether the page is a CSS stylesheet. + # def css? (content_type =~ /text\/css/) == 0 end # - # Returns +true+ if the page is a RSS/RDF feed, returns +false+ - # otherwise. + # Determines if the page is a RSS feed. # + # @return [Boolean] + # Specifies whether the page is a RSS feed. + # def rss? (content_type =~ /application\/(rss|rdf)\+xml/) == 0 end # - # Returns +true+ if the page is a Atom feed, returns +false+ - # otherwise. + # Determines if the page is an Atom feed. # + # @return [Boolean] + # Specifies whether the page is an Atom feed. + # def atom? (content_type =~ /application\/atom\+xml/) == 0 end # - # Returns +true+ if the page is a MS Word document, returns +false+ - # otherwise. + # Determines if the page is a MS Word document. # + # @return [Boolean] + # Specifies whether the page is a MS Word document. + # def ms_word? (content_type =~ /application\/msword/) == 0 end # - # Returns +true+ if the page is a PDF document, returns +false+ - # otherwise. + # Determines if the page is a PDF document. # + # @return [Boolean] + # Specifies whether the page is a PDF document. + # def pdf? (content_type =~ /application\/pdf/) == 0 end # - # Returns +true+ if the page is a ZIP archive, returns +false+ - # otherwise. + # Determines if the page is a ZIP archive. # + # @return [Boolean] + # Specifies whether the page is a ZIP archive. + # def zip? (content_type =~ /application\/zip/) == 0 end # - # Returns the body of the page in +String+ form. + # The body of the response. # + # @return [String] + # The body of the response. + # def body @response.body end # - # If the page has a <tt>text/html</tt> content-type, a - # Nokogiri::HTML::Document object will be returned. If the page has a - # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object - # will be returned. Other content-types will cause +nil+ to be - # returned. + # Returns a parsed document object for HTML, XML, RSS and Atom pages. # + # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil] + # The document that represents HTML or XML pages. + # Returns +nil+ if the page is neither HTML, XML, RSS, Atom or if + # the page could not be parsed properly. + # def doc return nil if (body.nil? || body.empty?) begin if html? return @doc ||= Nokogiri::HTML(body) - elsif xml? + elsif (xml? || rss? || atom?) return @doc ||= Nokogiri::XML(body) end rescue return nil end end # - # Returns all links from the HTML page. + # Searches the document for XPath or CSS Path paths. # + # @param [Array<String>] paths + # CSS or XPath expressions to search the document with. + # + # @return [Array] + # The matched nodes from the document. + # Returns an empty Array if no nodes were matched, or if the page + # is not an HTML or XML document. + # + # @example + # page.search('//a[@href]') + # + # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239 + # + def search(*paths) + if doc + return doc.search(*paths) + end + + return [] + end + + # + # Searches for the first occurrence an XPath or CSS Path expression. + # + # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil] + # The first matched node. Returns +nil+ if no nodes could be matched, + # or if the page is not a HTML or XML document. + # + # @example + # page.at('//title') + # + # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251 + # + def at(*arguments) + if doc + return doc.at(*arguments) + end + + return nil + end + + alias / search + alias % at + + # + # The title of the HTML page. + # + # @return [String] + # The inner-text of the title element of the page. + # + def title + if (node = at('//title')) + return node.inner_text + end + end + + # + # The links from within the page. + # + # @return [Array<String>] + # All links within the HTML page, frame/iframe source URLs and any + # links in the +Location+ header. + # def links urls = [] add_url = lambda { |url| urls << url unless (url.nil? || url.empty?) } case code when 300..303, 307 - add_url.call(@headers['location']) + location = @headers['location'] + + if location.kind_of?(Array) + # handle multiple location URLs + location.each(&add_url) + else + # usually the location header contains a single String + add_url.call(location) + end end if (html? && doc) doc.search('a[@href]').each do |a| add_url.call(a.get_attribute('href')) @@ -237,47 +377,48 @@ return urls end # - # Returns all links from the HtML page as absolute URLs. + # Absolute URIs from within the page. # + # @return [Array<URI::HTTP>] + # The links from within the page, converted to absolute URIs. + # def urls links.map { |link| to_absolute(link) }.compact end - protected - # - # Converts the specified _link_ into an absolute URL - # based on the url of the page. + # Normalizes and expands a given link into a proper URI. # + # @param [String] link + # The link to normalize and expand. + # + # @return [URI::HTTP] + # The normalized URI. + # def to_absolute(link) - # decode, clean then re-encode the URL - link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,'')) - begin - relative = URI(link) - absolute = @url.merge(relative) - - if absolute.path - if absolute.path.empty? - # default the absolute path to '/' - absolute.path = '/' - else - # make sure the path does not contain any .. or . directories. - absolute.path = File.expand_path(absolute.path) - end - end - - return absolute - rescue URI::InvalidURIError => e + url = @url.merge(link.to_s) + rescue URI::InvalidURIError return nil end + + unless (url.path.nil? || url.path.empty?) + # make sure the path does not contain any .. or . directories, + # since URI::Generic#merge cannot normalize paths such as + # "/stuff/../" + url.path = URI.expand_path(url.path) + end + + return url end + protected + # - # Provides transparent access to the values in the +headers+ +Hash+. + # Provides transparent access to the values in +headers+. # def method_missing(sym,*args,&block) if (args.empty? && block.nil?) name = sym.id2name.sub('_','-')