lib/spidr/page.rb in spidr-0.1.3 vs lib/spidr/page.rb in spidr-0.1.4
- old
+ new
@@ -1,7 +1,7 @@
require 'uri'
-require 'hpricot'
+require 'nokogiri'
module Spidr
class Page
# URL of the page
@@ -183,27 +183,32 @@
# Returns the body of the response held in @response.
def body
  response = @response
  response.body
end
#
- # Returns an Hpricot::Doc if the page represents a HTML document,
- # returns +nil+ otherwise.
+ # If the page has a <tt>text/html</tt> content-type, a
+ # Nokogiri::HTML::Document object will be returned. If the page has a
+ # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
+ # will be returned. Other content-types will cause +nil+ to be
+ # returned.
#
def doc
# The parsed document is cached in @doc via ||=, so body is not re-parsed
# on later calls once a truthy document has been stored.
# NOTE(review): html? and xml? are defined outside this view; presumably
# they test the response content-type -- confirm against their definitions.
if html?
- return @doc ||= Hpricot(body)
+ return @doc ||= Nokogiri::HTML(body)
+ elsif xml?
+ return @doc ||= Nokogiri::XML(body)
end
# No branch matched: the if-expression evaluates to nil, which becomes the
# method's return value.
end
#
# Returns all links from the HTML page.
#
def links
urls = []
if html?
- doc.search('a[@href]') do |a|
- url = a.attributes['href'].strip
+ self.doc.search('a[@href]').each do |a|
+ url = a.get_attribute('href')
urls << url unless url.empty?
end
end