lib/anemone/page.rb in anemone-0.2.3 vs lib/anemone/page.rb in anemone-0.3.0
- old
+ new
@@ -6,150 +6,144 @@
    # The URL of the page
    attr_reader :url
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error

    # OpenStruct for user-stored data
    attr_accessor :data
    # Integer response code of the page
    attr_accessor :code
    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time

#
# Create a new page
#
- def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
+ def initialize(url, params = {})
@url = url
- @code = code
- @headers = headers || {}
- @headers['content-type'] ||= ['']
- @aliases = Array(aka)
@data = OpenStruct.new
- @referer = referer
- @depth = depth || 0
- @response_time = response_time
- @doc = Nokogiri::HTML(body) if body && html? rescue nil
+
+ @code = params[:code]
+ @headers = params[:headers] || {}
+ @headers['content-type'] ||= ['']
+ @aliases = Array(params[:aka]).compact
+ @referer = params[:referer]
+ @depth = params[:depth] || 0
+ @redirect_to = to_absolute(params[:redirect_to])
+ @response_time = params[:response_time]
+ @body = params[:body]
+ @error = params[:error]
+
+ @fetched = !params[:code].nil?
end
# Array of distinct A tag HREFs from the page
def links
return @links unless @links.nil?
@links = []
return @links if !doc
-
+
doc.css('a').each do |a|
u = a.attributes['href'].content rescue nil
next if u.nil? or u.empty?
abs = to_absolute(URI(u)) rescue next
@links << abs if in_domain?(abs)
end
@links.uniq!
@links
end
-
+
    # Nokogiri document for the HTML body.
    # Lazily parsed on first access and memoized in @doc. Returns nil when
    # there is no response body, when the content-type is not HTML, or when
    # parsing raises (silenced by the rescue modifier).
    def doc
      return @doc if @doc
      @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
    end
+
+ # Delete the Nokogiri document and response body to conserve memory
def discard_doc!
links # force parsing of page links before we trash the document
- @doc = nil
+ @doc = @body = nil
end
    # Returns +true+ once an HTTP response has been recorded for this page
    # (i.e. a response code is known), +false+ otherwise.
    def fetched?
      @fetched
    end
#
- # Add a redirect-alias String *aka* to the list of the page's aliases
- #
- # Returns *self*
- #
- def add_alias!(aka)
- @aliases << aka if !@aliases.include?(aka)
- self
- end
-
- #
- # Returns an Array of all links from this page, and all the
- # redirect-aliases of those pages, as String objects.
- #
- # *page_hash* is a PageHash object with the results of the current crawl.
- #
- def links_and_their_aliases(page_hash)
- links.inject([]) do |results, link|
- results.concat([link].concat(page_hash[link].aliases))
- end
- end
-
- #
# The content-type returned by the HTTP request for this page
#
def content_type
headers['content-type'].first
end
-
+
#
# Returns +true+ if the page is a HTML document, returns +false+
# otherwise.
#
def html?
!!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
end
-
+
#
# Returns +true+ if the page is a HTTP redirect, returns +false+
# otherwise.
- #
+ #
def redirect?
(300..399).include?(@code)
end
-
+
#
# Returns +true+ if the page was not found (returned 404 code),
# returns +false+ otherwise.
#
def not_found?
404 == @code
end
-
+
#
# Converts relative URL *link* into an absolute URL based on the
# location of the page
#
def to_absolute(link)
+ return nil if link.nil?
+
# remove anchor
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
relative = URI(link)
absolute = @url.merge(relative)
absolute.path = '/' if absolute.path.empty?
return absolute
end
-
+
#
# Returns +true+ if *uri* is in the same domain as the page, returns
# +false+ otherwise
#
def in_domain?(uri)
uri.host == @url.host
end
+
+ def marshal_dump
+ [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
+ end
+
+ def marshal_load(ary)
+ @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
+ end
+
end
end