require 'uri'
require 'hpricot'

module Spidr
  #
  # Wraps the URL a page was requested from together with the HTTP
  # response that came back, and offers helpers for inspecting the
  # content type and extracting links.
  #
  class Page

    # URL of the page. It must be a URI object, since relative links are
    # resolved against it with URI#merge.
    attr_reader :url

    #
    # Creates a new Page object from the specified _url_ and HTTP
    # _response_.
    #
    def initialize(url,response)
      @url = url
      @response = response
      @doc = nil
    end

    #
    # Returns the headers of the response, as produced by the response's
    # +to_hash+ method.
    #
    # NOTE(review): presumably the response is a Net::HTTPResponse, whose
    # +to_hash+ maps lowercase header names to Arrays of values - confirm
    # against the caller.
    #
    def headers
      @response.to_hash
    end

    #
    # Returns the content-type of the page, or +nil+ if the response
    # has no Content-Type header.
    #
    def content_type
      @response['Content-Type']
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise. The content-type must begin with <tt>text/html</tt>.
    #
    def html?
      (content_type =~ /text\/html/) == 0
    end

    #
    # Returns +true+ if the page is a XML document, returns +false+
    # otherwise. The content-type must begin with <tt>text/xml</tt>.
    #
    def xml?
      (content_type =~ /text\/xml/) == 0
    end

    #
    # Returns +true+ if the page is a Javascript file, returns +false+
    # otherwise. Both <tt>text/javascript</tt> and
    # <tt>application/javascript</tt> are recognized.
    #
    def javascript?
      (content_type =~ /(text|application)\/javascript/) == 0
    end

    #
    # Returns +true+ if the page is a CSS file, returns +false+
    # otherwise.
    #
    def css?
      (content_type =~ /text\/css/) == 0
    end

    #
    # Returns +true+ if the page is a RSS/RDF feed, returns +false+
    # otherwise.
    #
    def rss?
      (content_type =~ /application\/(rss|rdf)\+xml/) == 0
    end

    #
    # Returns +true+ if the page is an Atom feed, returns +false+
    # otherwise.
    #
    def atom?
      (content_type =~ /application\/atom\+xml/) == 0
    end

    #
    # Returns the body of the response.
    #
    def body
      @response.body
    end

    #
    # Returns an Hpricot::Doc if the page represents a HTML document,
    # returns +nil+ otherwise. The parsed document is cached after the
    # first call.
    #
    def doc
      if html?
        return @doc ||= Hpricot(body)
      end
    end

    #
    # Returns all links from the HTML page: the whitespace-stripped
    # +href+ value of every +a+ element that has one. Returns an empty
    # Array if the page is not HTML.
    #
    def links
      if html?
        return doc.search('a[@href]').map do |a|
          a.attributes['href'].strip
        end
      end

      return []
    end

    #
    # Returns all links from the HTML page as absolute URLs.
    #
    def urls
      links.map { |link| to_absolute(link) }
    end

    protected

    #
    # Converts the specified _link_ into an absolute URL based on the
    # url of the page, and returns it as a URI object.
    #
    # Any fragment (<tt>#...</tt>) is removed from the link first. Links
    # that already carry a scheme are returned unchanged; everything else
    # is resolved against the page URL with URI#merge, so the link's own
    # query string, <tt>..</tt> segments and <tt>//host/path</tt> forms
    # are all honoured.
    #
    # Raises URI::InvalidURIError if the link cannot be parsed.
    #
    def to_absolute(link)
      link = escape_link(link.to_s.gsub(/#.*$/,''))
      relative = URI(link)

      # a link with its own scheme is already absolute
      return relative unless relative.scheme.nil?

      return @url.merge(relative)
    end

    #
    # Provides transparent access to the response headers: calling
    # +content_length+ on the page returns the value of the
    # <tt>content-length</tt> header, if the response has one. Any other
    # message is passed on to the default +method_missing+.
    #
    def method_missing(sym,*args,&block)
      if (args.empty? && block.nil?)
        name = header_name(sym)

        return @response[name] if @response.key?(name)
      end

      return super(sym,*args,&block)
    end

    #
    # Keeps +respond_to?+ consistent with +method_missing+ by reporting
    # +true+ for names that map to a header present in the response.
    #
    def respond_to_missing?(sym,include_private=false)
      @response.key?(header_name(sym)) || super
    end

    private

    #
    # Converts a method name into a header name by replacing every
    # underscore with a dash.
    #
    def header_name(sym)
      sym.to_s.tr('_','-')
    end

    #
    # Escapes the characters of _link_ that may not appear in a URI.
    #
    # URI.encode was removed in Ruby 3.0, so the RFC 2396 parser is used
    # directly when it is available; URI.encode remains the fallback for
    # Rubies that predate the parser objects.
    #
    def escape_link(link)
      if defined?(URI::RFC2396_PARSER)
        URI::RFC2396_PARSER.escape(link)
      elsif defined?(URI::DEFAULT_PARSER)
        URI::DEFAULT_PARSER.escape(link)
      else
        URI.encode(link)
      end
    end

  end
end