lib/spidr/page.rb in spidr-0.1.1 vs lib/spidr/page.rb in spidr-0.1.2
- old
+ new
@@ -196,17 +196,21 @@
#
# Returns all links from the HTML page.
#
def links
+ urls = []
+
if html?
- return doc.search('a[@href]').map do |a|
- a.attributes['href'].strip
+ doc.search('a[@href]') do |a|
+ url = a.attributes['href'].strip
+
+ urls << url unless url.empty?
end
end
- return []
+ return urls
end
#
# Returns all links from the HTML page as absolute URLs.
#
@@ -220,13 +224,25 @@
# Converts the specified _link_ into an absolute URL
# based on the URL of the page.
#
def to_absolute(link)
# clean the link
- link = URI.encode(link.to_s.gsub(/#.*$/,''))
+ link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
relative = URI(link)
- return @url.merge(relative)
+ absolute = @url.merge(relative)
+
+ if absolute.path
+ if absolute.path.empty?
+ # default the absolute path to '/'
+ absolute.path = '/'
+ else
+ # make sure the path does not contain any .. or . directories.
+ absolute.path = File.expand_path(absolute.path)
+ end
+ end
+
+ return absolute
end
#
# Provides transparent access to the values in the +headers+ +Hash+.
#