lib/spidr/page.rb in spidr-0.1.1 vs lib/spidr/page.rb in spidr-0.1.2

- old
+ new

@@ -196,17 +196,21 @@ # # Returns all links from the HTML page. # def links + urls = [] + if html? - return doc.search('a[@href]').map do |a| - a.attributes['href'].strip + doc.search('a[@href]') do |a| + url = a.attributes['href'].strip + + urls << url unless url.empty? end end - return [] + return urls end # # Returns all links from the HtML page as absolute URLs. # @@ -220,13 +224,25 @@ # Converts the specified _link_ into an absolute URL # based on the url of the page. # def to_absolute(link) # clean the link - link = URI.encode(link.to_s.gsub(/#.*$/,'')) + link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')) relative = URI(link) - return @url.merge(relative) + absolute = @url.merge(relative) + + if absolute.path + if absolute.path.empty? + # default the absolute path to '/' + absolute.path = '/' + else + # make sure the path does not contain any .. or . directories. + absolute.path = File.expand_path(absolute.path) + end + end + + return absolute end # # Provides transparent access to the values in the +headers+ +Hash+. #