page.rb in spidr-0.1.2

- old
+ new

@@ -196,17 +196,21 @@
 
     #
     # Returns all links from the HTML page.
     #
     def links
+      urls = [] # collected href values; stays empty when the page is not HTML
+
       if html?
-        return doc.search('a[@href]').map do |a|
-          a.attributes['href'].strip
+        doc.search('a[@href]') do |a| # presumably yields each <a> that carries an href -- confirm against doc's parser
+          url = a.attributes['href'].strip # trim surrounding whitespace from the raw href value
+
+          urls << url unless url.empty? # skip hrefs that are blank after stripping
         end
       end
 
-      return []
+      return urls
     end
 
     #
     # Returns all links from the HTML page as absolute URLs.
     #
@@ -220,13 +224,25 @@
     # Converts the specified _link_ into an absolute URL
     # based on the url of the page.
     #
     def to_absolute(link)
       # clean the link
-      link = URI.encode(link.to_s.gsub(/#.*$/,''))
+      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')) # only strips a trailing '#' followed by letters, digits, '_' or '-'
 
       relative = URI(link)
-      return @url.merge(relative)
+      absolute = @url.merge(relative) # resolve the link against this page's URL
+
+      if absolute.path # skip normalization entirely when the merged URI has no path
+        if absolute.path.empty?
+          # default the absolute path to '/'
+          absolute.path = '/'
+        else
+          # make sure the path does not contain any .. or . directories.
+          absolute.path = File.expand_path(absolute.path) # NOTE(review): filesystem helper used on a URL path; appears to also drop a trailing '/' -- confirm intended
+        end
+      end
+
+      return absolute
     end
 
     #
     # Provides transparent access to the values in the +headers+ +Hash+.
     #