lib/spidr/page.rb in spidr-0.1.4 vs lib/spidr/page.rb in spidr-0.1.5

- old
+ new

@@ -190,24 +190,28 @@
     # <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
     # will be returned. Other content-types will cause +nil+ to be
     # returned.
     #
     def doc
-      if html?
-        return @doc ||= Nokogiri::HTML(body)
-      elsif xml?
-        return @doc ||= Nokogiri::XML(body)
+      begin
+        if html?
+          return @doc ||= Nokogiri::HTML(body)
+        elsif xml?
+          return @doc ||= Nokogiri::XML(body)
+        end
+      rescue
+        return nil
       end
     end

     #
     # Returns all links from the HTML page.
     #
     def links
       urls = []

-      if html?
+      if (html? && self.doc)
         self.doc.search('a[@href]').each do |a|
           url = a.get_attribute('href')

           urls << url unless url.empty?
         end
@@ -218,11 +222,11 @@

     #
     # Returns all links from the HtML page as absolute URLs.
     #
     def urls
-      links.map { |link| to_absolute(link) }
+      links.map { |link| to_absolute(link) }.compact
     end

     protected

     #
@@ -231,23 +235,27 @@
     #
     def to_absolute(link)
       # clean the link
       link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

-      relative = URI(link)
-      absolute = @url.merge(relative)
+      begin
+        relative = URI(link)
+        absolute = @url.merge(relative)

-      if absolute.path
-        if absolute.path.empty?
-          # default the absolute path to '/'
-          absolute.path = '/'
-        else
-          # make sure the path does not contain any .. or . directories.
-          absolute.path = File.expand_path(absolute.path)
+        if absolute.path
+          if absolute.path.empty?
+            # default the absolute path to '/'
+            absolute.path = '/'
+          else
+            # make sure the path does not contain any .. or . directories.
+            absolute.path = File.expand_path(absolute.path)
+          end
         end
-      end
-      return absolute
+        return absolute
+      rescue URI::InvalidURIError => e
+        return nil
+      end
     end

     #
     # Provides transparent access to the values in the +headers+ +Hash+.
     #