require 'uri' require 'nokogiri' module Spidr class Page # URL of the page attr_reader :url # HTTP Response attr_reader :response # Body returned for the page attr_reader :body # Headers returned with the body attr_reader :headers # # Creates a new Page object from the specified _url_ and HTTP # _response_. # def initialize(url,response) @url = url @response = response @headers = response.to_hash @doc = nil end # # Returns the response code from the page. # def code @response.code end # # Returns +true+ if the response code is 200, returns +false+ otherwise. # def is_ok? code == 200 end # # Returns +true+ if the response code is 301 or 307, returns +false+ # otherwise. # def is_redirect? (code == 301 || code == 307) end # # Returns +true+ if the response code is 308, returns +false+ otherwise. # def timedout? code == 308 end # # Returns +true+ if the response code is 400, returns +false+ otherwise. # def bad_request? code == 400 end # # Returns +true+ if the response code is 401, returns +false+ otherwise. # def is_unauthorized? code == 401 end # # Returns +true+ if the response code is 403, returns +false+ otherwise. # def is_forbidden? code == 403 end # # Returns +true+ if the response code is 404, returns +false+ otherwise. # def is_missing? code == 404 end # # Returns +true+ if the response code is 500, returns +false+ otherwise. # def had_internal_server_error? code == 500 end # # Returns the content-type of the page. # def content_type @response['Content-Type'] end # # Returns +true+ if the page is a plain text document, returns +false+ # otherwise. # def plain_text? (content_type =~ /text\/plain/) == 0 end # # Returns +true+ if the page is a HTML document, returns +false+ # otherwise. # def html? (content_type =~ /text\/html/) == 0 end # # Returns +true+ if the page is a XML document, returns +false+ # otherwise. # def xml? (content_type =~ /text\/xml/) == 0 end # # Returns +true+ if the page is a Javascript file, returns +false+ # otherwise. # def javascript? (content_type =~ /(text|application)\/javascript/) == 0 end # # Returns +true+ if the page is a CSS file, returns +false+ # otherwise. # def css? (content_type =~ /text\/css/) == 0 end # # Returns +true+ if the page is a RSS/RDF feed, returns +false+ # otherwise. # def rss? (content_type =~ /application\/(rss|rdf)\+xml/) == 0 end # # Returns +true+ if the page is a Atom feed, returns +false+ # otherwise. # def atom? (content_type =~ /application\/atom\+xml/) == 0 end # # Returns +true+ if the page is a MS Word document, returns +false+ # otherwise. # def ms_word? (content_type =~ /application\/msword/) == 0 end # # Returns +true+ if the page is a PDF document, returns +false+ # otherwise. # def pdf? (content_type =~ /application\/pdf/) == 0 end # # Returns +true+ if the page is a ZIP archive, returns +false+ # otherwise. # def zip? (content_type =~ /application\/zip/) == 0 end # # Returns the body of the page in +String+ form. # def body @response.body end # # If the page has a text/html content-type, a # Nokogiri::HTML::Document object will be returned. If the page has a # text/xml content-type, a Nokogiri::XML::Document object # will be returned. Other content-types will cause +nil+ to be # returned. # def doc return nil if (body.nil? || body.empty?) begin if html? return @doc ||= Nokogiri::HTML(body) elsif xml? return @doc ||= Nokogiri::XML(body) end rescue return nil end end # # Returns all links from the HTML page. # def links urls = [] add_url = lambda { |url| urls << url unless (url.nil? || url.empty?) } case code when 300..303, 307 add_url.call(@headers['location']) end if (html? && doc) doc.search('a[@href]').each do |a| add_url.call(a.get_attribute('href')) end doc.search('frame[@src]').each do |iframe| add_url.call(iframe.get_attribute('src')) end doc.search('iframe[@src]').each do |iframe| add_url.call(iframe.get_attribute('src')) end end return urls end # # Returns all links from the HtML page as absolute URLs. # def urls links.map { |link| to_absolute(link) }.compact end protected # # Converts the specified _link_ into an absolute URL # based on the url of the page. # def to_absolute(link) # decode, clean then re-encode the URL link = URI.encode(URI.decode(link.to_s).gsub(/#[a-zA-Z0-9_-]*$/,'')) begin relative = URI(link) absolute = @url.merge(relative) if absolute.path if absolute.path.empty? # default the absolute path to '/' absolute.path = '/' else # make sure the path does not contain any .. or . directories. absolute.path = File.expand_path(absolute.path) end end return absolute rescue URI::InvalidURIError => e return nil end end # # Provides transparent access to the values in the +headers+ +Hash+. # def method_missing(sym,*args,&block) if (args.empty? && block.nil?) name = sym.id2name.sub('_','-') return @response[name] if @response.key?(name) end return super(sym,*args,&block) end end end