module Sunbro
  #
  # A single fetched (or attempted) page of a crawl: its URL, the HTTP
  # response data that came back for it, and helpers for parsing the body
  # and resolving links relative to it.
  #
  # NOTE(review): this class relies on +present?+ (ActiveSupport), Nokogiri,
  # WEBrick and JSON being loaded by the surrounding project.
  #
  class Page
    # The URL of the page
    attr_accessor :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error

    # Integer response code of the page
    attr_accessor :code
    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time
    # Value of the +:redirect_from+ param given at construction, if any
    attr_accessor :redirect_from

    #
    # Internal helper: lazily built RFC 2396 parser used for percent
    # escaping/unescaping. It stands in for URI.encode / URI.decode, which
    # were removed in Ruby 3.0. Built once and shared, because it is only
    # used for stateless escape/unescape calls.
    #
    def self.uri_parser
      @uri_parser ||= URI::RFC2396_Parser.new
    end

    #
    # Create a new page
    #
    # +url+ is expected to be a URI object (it is sent +merge+ and +host+).
    # +params+ may contain :code, :headers, :aka, :referer, :depth,
    # :redirect_to, :response_time, :error, :force_format, :body and
    # :redirect_from.
    #
    def initialize(url, params = {})
      @url = url
      @code = params[:code]
      @headers = params[:headers] || {}
      # Guarantee the key exists so content-type lookups never see nil.
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      # Resolved before @body is assigned, so the redirect target is made
      # absolute against @url rather than against a <base> element.
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @error = params[:error]
      @fetched = !params[:code].nil?
      @force_format = params[:force_format]
      @body = params[:body]
      @redirect_from = params[:redirect_from]
    end

    #
    # Nokogiri document for the HTML body
    #
    # Returns +nil+ for images and when there is no body. The result is
    # memoized only when it is non-nil.
    #
    def doc
      @doc ||= begin
        if image?
          nil
        elsif should_parse_as?(:xml)
          Nokogiri::XML(@body, @url.to_s)
        elsif should_parse_as?(:html)
          Nokogiri::HTML(@body, @url.to_s)
        elsif @body
          Nokogiri.parse(@body, @url.to_s)
        end
      end
    end

    #
    # Truthy when the page is not "about:blank", was not a 404 and is
    # present (see #present?).
    #
    def is_valid?
      (url != "about:blank") && !not_found? && present?
    end

    #
    # Truthy when there was no error, a response code is set, the body is
    # non-blank and it could be parsed into a document. Returns the parsed
    # document itself in that case.
    #
    def present?
      !error && code && body.present? && doc
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      @doc = @body = nil
    end

    #
    # Was the page fetched? +true+ if a response code was supplied when
    # the page was created, +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # Array of cookies received with this page as WEBrick::Cookie objects.
    # Best effort: any failure while parsing yields an empty array.
    #
    def cookies
      WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie'])
    rescue StandardError
      []
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    # Uses the first 'content-type' header value when it is non-blank and
    # otherwise falls back to a 'content_type' entry. Array() lets a bare
    # String (or missing) header value work as well as an Array.
    #
    def content_type
      primary = Array(headers['content-type']).first
      return primary if primary.present?
      headers['content_type']
    end

    #
    # Returns +true+ if the page is an image, returns +false+
    # otherwise.
    #
    def image?
      !!(content_type =~ %r{^(image/)\b})
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # The '+' must be escaped: unescaped it is a quantifier and the real
      # MIME type "application/xhtml+xml" would never match.
      !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
    end

    #
    # Returns +true+ if the page is a XML document, returns +false+
    # otherwise.
    #
    def xml?
      !!(content_type =~ %r{^(text/xml|application/xml)\b})
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise. Covers 300 through 308 (308 is Permanent Redirect).
    #
    def redirect?
      (300..308).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Base URI from the HTML doc head element
    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
    #
    # Returns +nil+ when there is no document, no usable href, or the href
    # is empty.
    #
    def base
      @base ||= if doc
        href = doc.search('//head/base/@href')
        begin
          URI(href.to_s) unless href.nil?
        rescue StandardError
          # An unparsable href is treated as "no base".
          nil
        end
      end

      return nil if @base && @base.to_s.empty?
      @base
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page. Returns +nil+ when *link* is +nil+.
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor
      without_anchor = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')
      # Unescape then re-escape to normalise percent-encoding. Done through
      # the RFC 2396 parser because URI.encode/URI.decode no longer exist
      # on Ruby 3.0+.
      parser = self.class.uri_parser
      relative = URI(parser.escape(parser.unescape(without_anchor)))

      absolute = base ? base.merge(relative) : @url.merge(relative)
      absolute.path = '/' if absolute.path.empty?
      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end

    #
    # Marshal support. The field order must stay in sync with #marshal_load;
    # it is left untouched so previously dumped pages still load.
    #
    def marshal_dump
      [@url, @headers, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
    end

    #
    # Restores the instance variables written by #marshal_dump.
    #
    def marshal_load(ary)
      @url, @headers, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
    end

    #
    # String-keyed Hash representation of the page. Headers are serialised
    # as JSON, URLs and the error as strings; entries whose value is +nil+
    # are omitted.
    #
    def to_hash
      {
        'url' => @url.to_s,
        'headers' => headers.to_json,
        'body' => @body,
        'code' => @code,
        'error' => (@error ? @error.to_s : nil),
        'visited' => @visited,
        'referer' => (@referer ? @referer.to_s : nil),
        'redirect_to' => (@redirect_to ? @redirect_to.to_s : nil),
        'redirect_from' => (@redirect_from ? @redirect_from.to_s : nil),
        'response_time' => @response_time,
        'fetched' => @fetched
      }.reject { |_key, value| value.nil? }
    end

    #
    # Rebuilds a page from a Hash shaped like the output of #to_hash.
    #
    # Missing 'code' / 'response_time' entries become 0 because of +to_i+.
    # NOTE(review): headers are decoded with JSON.load; if +hash+ can come
    # from an untrusted source, confirm this is acceptable for the JSON
    # library version in use.
    #
    def self.from_hash(hash)
      page = new(URI(hash['url']))
      {
        '@headers' => JSON.load(hash['headers']),
        '@body' => hash['body'],
        '@code' => hash['code'].to_i,
        '@error' => hash['error'],
        '@visited' => hash['visited'],
        '@referer' => hash['referer'],
        '@redirect_to' => hash['redirect_to'].present? ? URI(hash['redirect_to']) : nil,
        '@redirect_from' => hash['redirect_from'].present? ? URI(hash['redirect_from']) : nil,
        '@response_time' => hash['response_time'].to_i,
        '@fetched' => hash['fetched']
      }.each do |ivar, value|
        page.instance_variable_set(ivar, value)
      end
      page
    end

    private

    #
    # Returns a copy of *source* with invalid or undefined byte sequences
    # replaced by '?', by round-tripping through UTF-16. *source* is
    # returned untouched when it is nil or when the page is neither
    # HTML/XML nor has a forced format.
    #
    def cleanup_encoding(source)
      return source unless source && (html? || xml? || @force_format)

      text = source.dup
      # Options are passed as keywords: Ruby 3 rejects a positional Hash here.
      text.encode!('UTF-16', 'UTF-8', invalid: :replace, undef: :replace, replace: '?')
      text.encode('UTF-8', 'UTF-16')
    end

    #
    # Whether the body should be parsed as *format* (:xml or :html).
    # A forced format wins over the content-type; without a body the answer
    # is always +false+.
    #
    def should_parse_as?(format)
      return false unless @body
      return @force_format == format if @force_format
      send("#{format}?")
    end
  end
end