lib/spidr/page.rb in spidr-0.1.3 vs lib/spidr/page.rb in spidr-0.1.4

- old
+ new

@@ -1,7 +1,7 @@
 require 'uri'
-require 'hpricot'
+require 'nokogiri'

 module Spidr
   class Page

     # URL of the page
@@ -183,27 +183,32 @@
     def body
       @response.body
     end

     #
-    # Returns an Hpricot::Doc if the page represents a HTML document,
-    # returns +nil+ otherwise.
+    # If the page has a <tt>text/html</tt> content-type, a
+    # Nokogiri::HTML::Document object will be returned. If the page has a
+    # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
+    # will be returned. Other content-types will cause +nil+ to be
+    # returned.
     #
     def doc
       if html?
-        return @doc ||= Hpricot(body)
+        return @doc ||= Nokogiri::HTML(body)
+      elsif xml?
+        return @doc ||= Nokogiri::XML(body)
       end
     end

     #
     # Returns all links from the HTML page.
     #
     def links
       urls = []

       if html?
-        doc.search('a[@href]') do |a|
-          url = a.attributes['href'].strip
+        self.doc.search('a[@href]').each do |a|
+          url = a.get_attribute('href')

           urls << url unless url.empty?
         end
       end