lib/anemone/page.rb in anemone-0.6.1 vs lib/anemone/page.rb in anemone-0.7.0
- old
+ new
@@ -60,11 +60,11 @@
return @links if !doc
doc.search("//a[@href]").each do |a|
u = a['href']
next if u.nil? or u.empty?
- abs = to_absolute(URI(u)) rescue next
+ abs = to_absolute(URI(URI.escape(u))) rescue next
@links << abs if in_domain?(abs)
end
@links.uniq!
@links
end
@@ -130,21 +130,36 @@
def not_found?
  # A page counts as "not found" exactly when its response code is 404.
  @code == 404
end
#
+ # Base URI from the HTML doc head element
+ # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
+ #
def base
  # Returns the URI found by the '//head/base/@href' search, or nil when
  # there is no document, no usable href, or the href is blank.
  #
  # The lookup runs only while nothing is cached. A missing document is
  # not cached, so a later call retries the lookup.
  unless @base
    page = doc
    if page
      # Surrounding whitespace would make URI() raise and the base would
      # be silently dropped, so strip it before parsing.
      href = page.search('//head/base/@href').to_s.strip
      @base = begin
        URI(href)
      rescue URI::InvalidURIError
        nil # unparsable href: behave as if no base was declared
      end
    end
  end

  # A blank href yields an empty URI; report that as "no base".
  return nil if @base.nil? || @base.to_s.empty?

  @base
end
+
+
+ #
# Converts relative URL *link* into an absolute URL based on the
# document's base URI when one is present, otherwise on the location
# of the page
#
def to_absolute(link)
  # link - a String or URI, possibly relative; nil is passed through.
  # Returns an absolute URI whose path is never empty.
  return nil if link.nil?

  # remove anchor
  without_anchor = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')

  # Decode then re-encode so already-escaped and raw input end up escaped
  # exactly once. URI.encode / URI.decode were removed in Ruby 3.0; the
  # default parser's escape / unescape are used in their place.
  parser = URI::DEFAULT_PARSER
  relative = URI(parser.escape(parser.unescape(without_anchor)))

  # Evaluate the base once. Merging it onto the page URL leaves an absolute
  # base untouched and turns a relative one (e.g. "/static/") into an
  # absolute URI, which the merge below requires.
  page_base = base
  origin = page_base ? @url.merge(page_base) : @url

  absolute = origin.merge(relative)
  absolute.path = '/' if absolute.path.empty?
  absolute
end
@@ -188,10 +203,10 @@
'@links' => hash['links'].map { |link| URI(link) },
'@code' => hash['code'].to_i,
'@visited' => hash['visited'],
'@depth' => hash['depth'].to_i,
'@referer' => hash['referer'],
- '@redirect_to' => URI(hash['redirect_to']),
+ '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
'@response_time' => hash['response_time'].to_i,
'@fetched' => hash['fetched']
}.each do |var, value|
page.instance_variable_set(var, value)
end