page.rb in spidr-0.1.5

- old
+ new

@@ -190,24 +190,28 @@
     # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
     # will be returned. Other content-types will cause +nil+ to be
     # returned.
     #
     def doc
-      if html?
-        return @doc ||= Nokogiri::HTML(body)
-      elsif xml?
-        return @doc ||= Nokogiri::XML(body)
+      begin
+        if html?
+          return @doc ||= Nokogiri::HTML(body) # parsed on first call, then cached in @doc
+        elsif xml?
+          return @doc ||= Nokogiri::XML(body) # same caching for XML bodies
+        end
+      rescue # bare rescue: any StandardError raised inside the begin body lands here
+        return nil # same result callers get for an unsupported content-type
       end
     end
 
     #
     # Returns all links from the HTML page.
     #
     def links
       urls = []
 
-      if html?
+      if (html? && self.doc)
         self.doc.search('a[@href]').each do |a|
           url = a.get_attribute('href')
 
           urls << url unless url.empty?
         end
@@ -218,11 +222,11 @@
 
     #
     # Returns all links from the HTML page as absolute URLs.
     #
     def urls
-      links.map { |link| to_absolute(link) }
+      links.map { |link| to_absolute(link) }.compact # to_absolute returns nil for invalid links; compact drops those entries
     end
 
     protected
 
     #
@@ -231,23 +235,27 @@
     #
     def to_absolute(link)
       # clean the link
       link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
 
-      relative = URI(link)
-      absolute = @url.merge(relative)
+      begin
+        relative = URI(link) # may raise URI::InvalidURIError, handled by the rescue below
+        absolute = @url.merge(relative) # resolve the link relative to @url
 
-      if absolute.path
-        if absolute.path.empty?
-          # default the absolute path to '/'
-          absolute.path = '/'
-        else
-          # make sure the path does not contain any .. or . directories.
-          absolute.path = File.expand_path(absolute.path)
+        if absolute.path
+          if absolute.path.empty?
+            # an empty path is normalized to the root path '/'
+            absolute.path = '/'
+          else
+            # collapse any '.' and '..' segments out of the path
+            absolute.path = File.expand_path(absolute.path) # NOTE(review): filesystem helper applied to a URI path - confirm on non-POSIX hosts
+          end
         end
-      end
 
-      return absolute
+        return absolute
+      rescue URI::InvalidURIError => e # invalid link: report it as nil instead of raising (e is unused)
+        return nil
+      end
     end
 
     #
     # Provides transparent access to the values in the +headers+ +Hash+.
     #