lib/content_link_parser.rb in cobweb-1.0.29 vs lib/content_link_parser.rb in cobweb-1.1.0
- old
+ new
@@ -6,16 +6,15 @@
# Parses the content and absolutizes the URLs based on url. Options can be set up to determine which links are extracted.
def initialize(url, content, options = {})
@options = {}.merge(options)
@url = url
+ @base_url = ''
@doc = Nokogiri::HTML(content)
- base_url = @url.to_s
if @doc.at("base[href]")
- base_url = @doc.at("base[href]").attr("href").to_s
- @url = base_url if base_url
+ @base_url = @doc.at("base[href]").attr("href").to_s if @doc.at("base[href]").attr("href").to_s.present?
end
@options[:tags] = {}
@options[:tags][:links] = [["a[href]", "href"], ["frame[src]", "src"], ["meta[@http-equiv=\"refresh\"]", "content"], ["link[href]:not([rel])", "href"], ["area[href]", "href"]]
@options[:tags][:images] = [["img[src]", "src"]]
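
In 1.1.0 the value of any <base href> tag is kept in a separate @base_url (defaulting to an empty string) instead of overwriting @url as 1.0.29 did, and it is only recorded when the attribute is non-empty (String#present? comes from ActiveSupport). A minimal sketch of that extraction in isolation, using a hypothetical extract_base_url helper and a plain empty? check in place of present?:

require 'nokogiri'

# Hypothetical helper mirroring the 1.1.0 behaviour: return the <base href>
# value when it is non-empty, otherwise an empty string, leaving the page
# URL untouched either way.
def extract_base_url(content)
  doc = Nokogiri::HTML(content)
  base = doc.at("base[href]")
  href = base && base.attr("href").to_s
  (href.nil? || href.empty?) ? '' : href
end

extract_base_url('<html><head><base href="http://example.com/docs/"></head></html>')
# => "http://example.com/docs/"
extract_base_url('<html><head><base href=""></head></html>')
# => ""  (1.0.29 would have overwritten @url even with this empty value; 1.1.0 ignores it)
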
@@ -44,10 +43,12 @@
# Returns an array of all absolutized links; specify :valid_schemes in options to limit to certain schemes. Also filters repeating folders (i.e. if the crawler got into a link loop).
def all_links(options = {})
options[:valid_schemes] = [:http, :https] unless options.has_key? :valid_schemes
data = link_data
links = data.keys.map{|key| data[key]}.flatten.uniq
- links = links.map{|link| UriHelper.join_no_fragment(@url, link).to_s }
+ links = links.map{|link| UriHelper.join_no_fragment(@url, UriHelper.join_no_fragment(@base_url, link))}
+ .reject(&:nil?)
+ .map(&:to_s)
links = links.reject{|link| link =~ /\/([^\/]+?)\/\1\// }
links = links.reject{|link| link =~ /([^\/]+?)\/([^\/]+?)\/.*?\1\/\2/ }
links = links.select{|link| options[:valid_schemes].include? link.split(':')[0].to_sym}
links
end
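
Where 1.0.29 joined each extracted link directly against @url, 1.1.0 first resolves the link against @base_url, then against @url, and drops anything the helper could not resolve (nil) before converting to strings. Below is a rough approximation of that two-step resolution using Ruby's stdlib URI in place of cobweb's UriHelper.join_no_fragment, which is assumed here to behave roughly like URI.join with the fragment stripped and nil on failure; the URLs and links are made-up example values.

require 'uri'

# Stand-in for UriHelper.join_no_fragment: join link onto base, drop any
# fragment, and return nil when the link cannot be resolved.
def join_no_fragment(base, link)
  return nil if link.nil?
  joined = URI.join(base.to_s, link.to_s)
  joined.fragment = nil
  joined
rescue URI::Error
  nil
end

page_url  = "http://example.com/articles/index.html"   # plays the role of @url
base_url  = "http://example.com/docs/"                  # plays the role of @base_url
raw_links = ["page.html", "#top", "http://other.example/post"]

links = raw_links.map { |link| join_no_fragment(page_url, join_no_fragment(base_url, link)) }
                 .reject(&:nil?)
                 .map(&:to_s)
# => ["http://example.com/docs/page.html", "http://example.com/docs/", "http://other.example/post"]

Keeping @base_url separate means @url still reflects the page that was actually fetched while relative links are resolved through the declared base first, and the reject(&:nil?) step guards against links the helper cannot parse. Note the real helper must also cope with the empty-string default for @base_url, which plain URI.join would reject, so this sketch only illustrates the non-empty case.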