lib/spidr/page.rb in spidr-0.2.7 vs lib/spidr/page.rb in spidr-0.3.0

- old
+ new

@@ -1,19 +1,18 @@ -require 'spidr/extensions/uri' +require 'spidr/headers' +require 'spidr/body' +require 'spidr/links' -require 'set' -require 'uri' -require 'nokogiri' - module Spidr # # Represents a requested page from a website. # class Page - # Reserved names used within Cookie strings - RESERVED_COOKIE_NAMES = Set['path', 'expires', 'domain'] + include Headers + include Body + include Links # URL of the page attr_reader :url # HTTP Response @@ -37,31 +36,26 @@ @headers = response.to_hash @doc = nil end # - # The response code from the page. + # The meta-redirect links of the page. # - # @return [Integer] - # Response code from the page. + # @return [Array<String>] + # All meta-redirect links in the page. # - def code - @response.code.to_i - end - + # @deprecated + # Deprecated in 0.3.0 and will be removed in 0.4.0. + # Use {#meta_redirects} instead. # - # Determines if the response code is `200`. - # - # @return [Boolean] - # Specifies whether the response code is `200`. - # - def is_ok? - code == 200 + def meta_redirect + STDERR.puts 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.3.0' + STDERR.puts 'DEPRECATION: Use Spidr::Page#meta_redirects instead' + + meta_redirects end - alias ok? is_ok? - # # Determines if the response code is `300`, `301`, `302`, `303` # or `307`. Also checks for "soft" redirects added at the page # level by a meta refresh tag. # @@ -79,534 +73,36 @@ end end alias redirect? is_redirect? - # - # Determines if the response code is `308`. - # - # @return [Boolean] - # Specifies whether the response code is `308`. - # - def timedout? - code == 308 - end + protected # - # Determines if the response code is `400`. + # Provides transparent access to the values in {#headers}. # - # @return [Boolean] - # Specifies whether the response code is `400`. + # @param [Symbol] name + # The name of the missing method. # - def bad_request? - code == 400 - end - + # @param [Array] arguments + # Additional arguments for the missing method. # - # Determines if the response code is `401`. - # - # @return [Boolean] - # Specifies whether the response code is `401`. - # - def is_unauthorized? - code == 401 - end - - alias unauthorized? is_unauthorized? - - # - # Determines if the response code is `403`. - # - # @return [Boolean] - # Specifies whether the response code is `403`. - # - def is_forbidden? - code == 403 - end - - alias forbidden? is_forbidden? - - # - # Determines if the response code is `404`. - # - # @return [Boolean] - # Specifies whether the response code is `404`. - # - def is_missing? - code == 404 - end - - alias missing? is_missing? - - # - # Determines if the response code is `500`. - # - # @return [Boolean] - # Specifies whether the response code is `500`. - # - def had_internal_server_error? - code == 500 - end - - # - # The Content-Type of the page. - # # @return [String] - # The Content-Type of the page. + # The missing method mapped to a header in {#headers}. # - def content_type - (@response['Content-Type'] || '') - end - + # @raise [NoMethodError] + # The missing method did not map to a header in {#headers}. # - # The content types of the page. - # - # @return [Array<String>] - # The values within the Content-Type header. - # - # @since 0.2.2 - # - def content_types - (@headers['content-type'] || []) - end + def method_missing(name,*arguments,&block) + if (arguments.empty? && block.nil?) + header_name = name.to_s.sub('_','-') - # - # Determines if the page is plain-text. - # - # @return [Boolean] - # Specifies whether the page is plain-text. - # - def plain_text? - is_content_type?('text/plain') - end - - alias txt? plain_text? - - # - # Determines if the page is HTML document. - # - # @return [Boolean] - # Specifies whether the page is HTML document. - # - def html? - is_content_type?('text/html') - end - - # - # Determines if the page is XML document. - # - # @return [Boolean] - # Specifies whether the page is XML document. - # - def xml? - is_content_type?('text/xml') - end - - # - # Determines if the page is XML Stylesheet (XSL). - # - # @return [Boolean] - # Specifies whether the page is XML Stylesheet (XSL). - # - def xsl? - is_content_type?('text/xsl') - end - - # - # Determines if the page is JavaScript. - # - # @return [Boolean] - # Specifies whether the page is JavaScript. - # - def javascript? - is_content_type?('text/javascript') || \ - is_content_type?('application/javascript') - end - - # - # Determines if the page is a CSS stylesheet. - # - # @return [Boolean] - # Specifies whether the page is a CSS stylesheet. - # - def css? - is_content_type?('text/css') - end - - # - # Determines if the page is a RSS feed. - # - # @return [Boolean] - # Specifies whether the page is a RSS feed. - # - def rss? - is_content_type?('application/rss+xml') || \ - is_content_type?('application/rdf+xml') - end - - # - # Determines if the page is an Atom feed. - # - # @return [Boolean] - # Specifies whether the page is an Atom feed. - # - def atom? - is_content_type?('application/atom+xml') - end - - # - # Determines if the page is a MS Word document. - # - # @return [Boolean] - # Specifies whether the page is a MS Word document. - # - def ms_word? - is_content_type?('application/msword') - end - - # - # Determines if the page is a PDF document. - # - # @return [Boolean] - # Specifies whether the page is a PDF document. - # - def pdf? - is_content_type?('application/pdf') - end - - # - # Determines if the page is a ZIP archive. - # - # @return [Boolean] - # Specifies whether the page is a ZIP archive. - # - def zip? - is_content_type?('application/zip') - end - - # - # The raw Cookie String sent along with the page. - # - # @return [String] - # The raw Cookie from the response. - # - # @since 0.2.7 - # - def raw_cookie - (@response['Set-Cookie'] || '') - end - - # - # The raw Cookie String sent along with the page. - # - # @return [String] - # The raw Cookie from the response. - # - # @deprecated - # Deprecated in 0.2.7 and will be removed in 0.3.0. - # Use {#raw_cookie} instead. - # - # @since 0.2.2 - # - def cookie - STDERR.puts 'DEPRECATION: Spidr::Page#cookie will be removed in 0.3.0' - STDERR.puts 'DEPRECATION: Use Spidr::Page#raw_cookie instead' - - return raw_cookie - end - - # - # The Cookie values sent along with the page. - # - # @return [Array<String>] - # The Cookies from the response. - # - # @since 0.2.2 - # - def cookies - (@headers['set-cookie'] || []) - end - - # - # The Cookie key -> value pairs returned with the response. - # - # @return [Hash{String => String}] - # The cookie keys and values. - # - # @since 0.2.2 - # - def cookie_params - params = {} - - cookies.each do |cookie| - cookie.split('; ').each do |key_value| - key, value = key_value.split('=',2) - - next if RESERVED_COOKIE_NAMES.include?(key) - - params[key] = (value || '') + if @response.key?(header_name) + return @response[header_name] end end - return params - end - - # - # The body of the response. - # - # @return [String] - # The body of the response. - # - def body - (@response.body || '') - end - - # - # Returns a parsed document object for HTML, XML, RSS and Atom pages. - # - # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil] - # The document that represents HTML or XML pages. - # Returns `nil` if the page is neither HTML, XML, RSS, Atom or if - # the page could not be parsed properly. - # - # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html - # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html - # - def doc - return nil if body.empty? - - begin - if html? - return @doc ||= Nokogiri::HTML(body) - elsif (xml? || xsl? || rss? || atom?) - return @doc ||= Nokogiri::XML(body) - end - rescue - return nil - end - end - - # - # Searches the document for XPath or CSS Path paths. - # - # @param [Array<String>] paths - # CSS or XPath expressions to search the document with. - # - # @return [Array] - # The matched nodes from the document. - # Returns an empty Array if no nodes were matched, or if the page - # is not an HTML or XML document. - # - # @example - # page.search('//a[@href]') - # - # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239 - # - def search(*paths) - if doc - doc.search(*paths) - else - [] - end - end - - # - # Searches for the first occurrence an XPath or CSS Path expression. - # - # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil] - # The first matched node. Returns `nil` if no nodes could be matched, - # or if the page is not a HTML or XML document. - # - # @example - # page.at('//title') - # - # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251 - # - def at(*arguments) - if doc - doc.at(*arguments) - end - end - - alias / search - alias % at - - # - # The title of the HTML page. - # - # @return [String] - # The inner-text of the title element of the page. - # - def title - if (node = at('//title')) - node.inner_text - end - end - - # - # The links from within the page. - # - # @return [Array<String>] - # All links within the HTML page, frame/iframe source URLs and any - # links in the `Location` header. - # - def links - urls = [] - - add_url = lambda { |url| - urls << url unless (url.nil? || url.empty?) - } - - self.redirects_to.each(&add_url) if self.is_redirect? - - if (html? && doc) - doc.search('a[@href]').each do |a| - add_url.call(a.get_attribute('href')) - end - - doc.search('frame[@src]').each do |iframe| - add_url.call(iframe.get_attribute('src')) - end - - doc.search('iframe[@src]').each do |iframe| - add_url.call(iframe.get_attribute('src')) - end - - doc.search('link[@href]').each do |link| - add_url.call(link.get_attribute('href')) - end - - doc.search('script[@src]').each do |script| - add_url.call(script.get_attribute('src')) - end - end - - return urls - end - - # - # URL(s) that this document redirects to. - # - # @return [Array<String>] - # The links that this page redirects to (usually found in a - # location header or by way of a page-level meta redirect). - # - def redirects_to - location = @headers['location'] - - if location.nil? - # check page-level meta redirects if there isn't a location header - meta_redirect - elsif location.kind_of?(Array) - location - else - # usually the location header contains a single String - [location] - end - end - - # - # Absolute URIs from within the page. - # - # @return [Array<URI::HTTP>] - # The links from within the page, converted to absolute URIs. - # - def urls - links.map { |link| to_absolute(link) }.compact - end - - # - # Normalizes and expands a given link into a proper URI. - # - # @param [String] link - # The link to normalize and expand. - # - # @return [URI::HTTP] - # The normalized URI. - # - def to_absolute(link) - begin - url = @url.merge(link.to_s) - rescue URI::InvalidURIError, URI::InvalidComponentError - return nil - end - - unless (url.path.nil? || url.path.empty?) - # make sure the path does not contain any .. or . directories, - # since URI::Generic#merge cannot normalize paths such as - # "/stuff/../" - url.path = URI.expand_path(url.path) - end - - return url - end - - # - # Determines if a page-level "soft" redirect is present. If yes, - # returns an array of those redirects (usually a single URL). - # Otherwise, returns false. - # - # @return [Array<String>] - # An array of redirect URLs - # - def meta_redirect - redirects = [] - - if (html? && doc) - search('//meta[@http-equiv and @content]').each do |node| - if node.get_attribute('http-equiv') =~ /refresh/i - content = node.get_attribute('content') - - if (redirect = content.match(/url=(\S+)$/)) - redirects << redirect[1] - end - end - end - end - - return redirects.uniq - end - - # - # Returns a boolean indicating whether or not page-level meta - # redirects are present in this page. - # - # @return [Boolean] - # Specifies whether the page includes page-level redirects. - # - def meta_redirect? - !meta_redirect.empty? - end - - protected - - # - # Determines if any of the content-types of the page include a given - # type. - # - # @param [String] type - # The content-type to test for. - # - # @return [Boolean] - # Specifies whether the page includes the given content-type. - # - # @since 0.2.4 - # - def is_content_type?(type) - content_types.any? { |content| content.include?(type) } - end - - # - # Provides transparent access to the values in `headers`. - # - def method_missing(sym,*args,&block) - if (args.empty? && block.nil?) - name = sym.id2name.sub('_','-') - - return @response[name] if @response.key?(name) - end - - return super(sym,*args,&block) + return super(name,*arguments,&block) end end end