lib/anemone/page.rb in anemone-0.0.3 vs lib/anemone/page.rb in anemone-0.0.4
- old
+ new
@@ -1,27 +1,22 @@
require 'anemone/http'
-require 'hpricot'
+require 'nokogiri'
+require 'facets/ostructable'
module Anemone
class Page
+ include OpenStructable
+
# The URL of the page
attr_reader :url
# Array of distinct A tag HREFs from the page
attr_reader :links
- #Body of the HTTP response
- attr_reader :body
#Content-type of the HTTP response
attr_reader :content_type
- #title of the page if it is an HTML document
- attr_reader :title
- #first h1 on the page, if present
- attr_reader :h1
- #first h2 on the page, if present
- attr_reader :h2
- #meta-description of the page, if present
- attr_reader :description
+ #Nokogiri document for the HTML body
+ attr_accessor :doc
# Integer response code of the page
attr_accessor :code
# Array of redirect-aliases for the page
attr_accessor :aliases
# Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
@@ -52,40 +47,32 @@
#
# Create a new page
#
def initialize(url, body = nil, code = nil, content_type = nil, aka = nil)
@url = url
- @body = body unless Anemone.options.discard_page_bodies
@code = code
@content_type = content_type
@links = []
@aliases = []
-
+
+ #create empty storage for OpenStructable
+ update({})
+
@aliases << aka if !aka.nil?
if body
- h = Hpricot(body)
+ begin
+ @doc = Nokogiri::HTML(body)
+ rescue
+ return
+ end
- #save page title
- title_elem = h.at('title')
- @title = title_elem.inner_html if !title_elem.nil?
+ return if @doc.nil?
- #save page h1
- h1_elem = h.at('h1')
- @h1 = h1_elem.inner_html if !h1_elem.nil?
-
- #save page h2
- h2_elem = h.at('h2')
- @h2 = h2_elem.inner_html if !h2_elem.nil?
-
- #save page meta-description
- description_elem = h.at('meta[@name=description]')
- @description = description_elem['content'] if !description_elem.nil?
-
#get a list of distinct links on the page, in absolute url form
- h.search('a').each do |a|
- u = a['href']
+ @doc.css('a').each do |a|
+ u = a.attribute('href')
next if u.nil?
begin
abs = to_absolute(URI(u))
rescue
@@ -104,12 +91,12 @@
# Return a new page with the same *response* and *url*, but
# with a 200 response code
#
def alias_clone(url)
# Builds a copy of this page via #clone, forces the copy's response code
# to 200, and returns the copy.
# NOTE(review): the +url+ argument is not referenced anywhere in this body.
# NOTE(review): #clone is presumably Ruby's default shallow copy, so the copy
# would share the @links / @aliases arrays with the original — TODO confirm
# that no custom copy hook is defined elsewhere.
p = clone
# NOTE(review): no assignment to @aka is visible in this excerpt (the
# constructor appends its +aka+ argument to @aliases instead), so this guard
# may never fire — confirm against the full file.
- p.add_alias!(@aka) if !@aka.nil?
- p.code = 200
- p
+ p.add_alias!(@aka) if !@aka.nil?
+ p.code = 200
+ p
end
#
# Add a redirect-alias String *aka* to the list of the page's aliases
#