lib/anemone/page.rb in anemone-0.2.0 vs lib/anemone/page.rb in anemone-0.2.1
- old
+ new
@@ -1,16 +1,13 @@
-require 'anemone/http'
require 'nokogiri'
require 'ostruct'
module Anemone
class Page
# The URL of the page
attr_reader :url
- # Array of distinct A tag HREFs from the page
- attr_reader :links
# Headers of the HTTP response
attr_reader :headers
# OpenStruct for user-stored data
attr_accessor :data
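This first hunk is the visible edge of the release's main change: `require 'anemone/http'` disappears because Page stops fetching itself (the `self.fetch` method is removed in the next hunk), and `attr_reader :links` disappears because link extraction becomes a lazy, memoized method. A minimal sketch of the new contract, assuming the `to_absolute` and `in_domain?` helpers defined elsewhere in this file behave as their names suggest:

    require 'anemone/page'

    page = Anemone::Page.new(URI('http://example.com/'),
                             '<a href="/a">a</a><a href="/a">a</a>',
                             200, { 'content-type' => ['text/html'] })
    page.links # first call parses, dedupes, memoizes => one URI for http://example.com/a
    page.links # later calls return the cached array without touching Nokogiri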
@@ -25,78 +22,49 @@
# Depth of this page from the root of the crawl. This is not necessarily the
# shortest path; use PageHash#shortest_paths! to find that value.
attr_accessor :depth
# URL of the page that brought us to this page
attr_accessor :referer
+ # Response time of the request for this page in milliseconds
+ attr_accessor :response_time
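`response_time` is only storage here; the value has to be measured by whatever performs the request, presumably the `Anemone::HTTP` code that replaces the removed `self.fetch` below. A hedged sketch of how a caller could populate it, using only stdlib calls rather than anemone's actual HTTP wiring:

    require 'net/http'
    require 'anemone/page'

    url = URI('http://example.com/')
    start = Time.now
    response = Net::HTTP.get_response(url)
    elapsed_ms = ((Time.now - start) * 1000).to_i
    page = Anemone::Page.new(url, response.body, response.code.to_i,
                             response.to_hash, nil, nil, 0, elapsed_ms)
    page.response_time # => elapsed_ms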
#
- # Create a new Page from the response of an HTTP request to *url*
- #
- def self.fetch(url, from_page = nil)
- begin
- url = URI(url) unless url.is_a?(URI)
-
- if from_page
- referer = from_page.url
- depth = from_page.depth + 1
- end
-
- response, code, location = Anemone::HTTP.get(url, referer)
-
- aka = nil
- if !url.eql?(location)
- aka = location
- end
-
- return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
- rescue
- return Page.new(url)
- end
- end
-
- #
# Create a new page
#
- def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
+ def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
@url = url
@code = code
@headers = headers
- @links = []
- @aliases = []
+ @headers['content-type'] ||= ['']
+ @aliases = Array(aka)
@data = OpenStruct.new
@referer = referer
@depth = depth || 0
+ @response_time = response_time
+ @doc = Nokogiri::HTML(body) if body && html? rescue nil
+ end
- @aliases << aka if !aka.nil?
-
- if body
- begin
- @doc = Nokogiri::HTML(body)
- rescue
- return
- end
-
- return if @doc.nil?
-
- # get a list of distinct links on the page, in absolute URL form
- @doc.css('a').each do |a|
- u = a.attributes['href'].content if a.attributes['href']
- next if u.nil?
-
- begin
- abs = to_absolute(URI(u))
- rescue
- next
- end
-
- @links << abs if in_domain?(abs)
- end
-
- @links.uniq!
+ # Array of distinct A tag HREFs from the page
+ def links
+ return @links unless @links.nil?
+ @links = []
+ return @links if !doc
+
+ doc.css('a').each do |a|
+ u = a.attributes['href'].content rescue nil
+ next if u.nil? or u.empty?
+ abs = to_absolute(URI(u)) rescue next
+ @links << abs if in_domain?(abs)
end
+ @links.uniq!
+ @links
end
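Two details in the new `links` body are easy to misread. The statement-modifier `rescue`s silently skip anchors whose href is missing or fails to parse as a URI, and the method cannot simply end with `@links.uniq!` because `Array#uniq!` returns nil when there was nothing to remove; hence the explicit `@links` on the final line:

    [1, 2].uniq! # => nil   (no duplicates, array untouched)
    [1, 1].uniq! # => [1]   (mutated in place)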
+ def discard_doc!
+ links # force parsing of page links before we trash the document
+ @doc = nil
+ end
#
# Return a new page with the same *response* and *url*, but
# with a 200 response code
#
@@ -122,27 +90,27 @@
# redirect-aliases of those pages, as String objects.
#
# *page_hash* is a PageHash object with the results of the current crawl.
#
def links_and_their_aliases(page_hash)
- @links.inject([]) do |results, link|
+ links.inject([]) do |results, link|
results.concat([link].concat(page_hash[link].aliases))
end
end
#
# The content-type returned by the HTTP request for this page
#
def content_type
- @headers['content-type'][0] rescue nil
+ headers['content-type'].first
end
#
# Returns +true+ if the page is an HTML document, returns +false+
# otherwise.
#
def html?
- (@content_type =~ /text\/html/) == 0
+ !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
end
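`content_type` can drop its `rescue nil` because the constructor now guarantees a `content-type` entry, and `html?` now goes through that reader instead of `@content_type`, an instance variable nothing in this diff assigns. One caveat in the new pattern: inside `%r{}` an unescaped `+` quantifies the preceding `l`, so a literal XHTML media type never matches as written; escaping it would:

    'application/xhtml+xml' =~ %r{^(text/html|application/xhtml+xml)\b}  # => nil
    'application/xhtml+xml' =~ %r{^(text/html|application/xhtml\+xml)\b} # => 0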
#
# Returns +true+ if the page is an HTTP redirect, returns +false+
# otherwise.