lib/anemone/page.rb in anemone-0.0.3 vs lib/anemone/page.rb in anemone-0.0.4

- old
+ new

@@ -1,27 +1,22 @@ require 'anemone/http' -require 'hpricot' +require 'nokogiri' +require 'facets/ostructable' module Anemone class Page + include OpenStructable + # The URL of the page attr_reader :url # Array of distinct A tag HREFs from the page attr_reader :links - #Body of the HTTP response - attr_reader :body #Content-type of the HTTP response attr_reader :content_type - #title of the page if it is an HTML document - attr_reader :title - #first h1 on the page, if present - attr_reader :h1 - #first h2 on the page, if present - attr_reader :h2 - #meta-description of the page, if present - attr_reader :description + #Nokogiri document for the HTML body + attr_accessor :doc # Integer response code of the page attr_accessor :code # Array of redirect-aliases for the page attr_accessor :aliases # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths! @@ -52,40 +47,32 @@ # # Create a new page # def initialize(url, body = nil, code = nil, content_type = nil, aka = nil) @url = url - @body = body unless Anemone.options.discard_page_bodies @code = code @content_type = content_type @links = [] @aliases = [] - + + #create empty storage for OpenStructable + update({}) + @aliases << aka if !aka.nil? if body - h = Hpricot(body) + begin + @doc = Nokogiri::HTML(body) + rescue + return + end - #save page title - title_elem = h.at('title') - @title = title_elem.inner_html if !title_elem.nil? + return if @doc.nil? - #save page h1 - h1_elem = h.at('h1') - @h1 = h1_elem.inner_html if !h1_elem.nil? - - #save page h2 - h2_elem = h.at('h2') - @h2 = h2_elem.inner_html if !h2_elem.nil? - - #save page meta-description - description_elem = h.at('meta[@name=description]') - @description = description_elem['content'] if !description_elem.nil? - #get a list of distinct links on the page, in absolute url form - h.search('a').each do |a| - u = a['href'] + @doc.css('a').each do |a| + u = a.attribute('href') next if u.nil? begin abs = to_absolute(URI(u)) rescue @@ -104,12 +91,12 @@ # Return a new page with the same *response* and *url*, but # with a 200 response code # def alias_clone(url) p = clone - p.add_alias!(@aka) if !@aka.nil? - p.code = 200 - p + p.add_alias!(@aka) if !@aka.nil? + p.code = 200 + p end # # Add a redirect-alias String *aka* to the list of the page's aliases #