lib/anemone/page.rb in anemone-0.6.1 vs lib/anemone/page.rb in anemone-0.7.0

- old
+ new

@@ -60,11 +60,11 @@
       return @links if !doc
       doc.search("//a[@href]").each do |a|
         u = a['href']
         next if u.nil? or u.empty?
-        abs = to_absolute(URI(u)) rescue next
+        abs = to_absolute(URI(URI.escape(u))) rescue next
         @links << abs if in_domain?(abs)
       end
       @links.uniq!
       @links
     end
@@ -130,21 +130,36 @@
     def not_found?
       404 == @code
     end
     #
+    # Base URI from the HTML doc head element
+    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
+    #
+    def base
+      @base = if doc
+        href = doc.search('//head/base/@href')
+        URI(href.to_s) unless href.nil? rescue nil
+      end unless @base
+
+      return nil if @base && @base.to_s().empty?
+      @base
+    end
+
+
+    #
+    # Converts relative URL *link* into an absolute URL based on the
     # location of the page
     #
     def to_absolute(link)
       return nil if link.nil?
       # remove anchor
       link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))
       relative = URI(link)
-      absolute = @url.merge(relative)
+      absolute = base ? base.merge(relative) : @url.merge(relative)
       absolute.path = '/' if absolute.path.empty?
       return absolute
     end
@@ -188,10 +203,10 @@
        '@links' => hash['links'].map { |link| URI(link) },
        '@code' => hash['code'].to_i,
        '@visited' => hash['visited'],
        '@depth' => hash['depth'].to_i,
        '@referer' => hash['referer'],
-       '@redirect_to' => URI(hash['redirect_to']),
+       '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
        '@response_time' => hash['response_time'].to_i,
        '@fetched' => hash['fetched']
       }.each do |var, value|
         page.instance_variable_set(var, value)
       end