lib/spidr/page.rb in spidr-0.1.4 vs lib/spidr/page.rb in spidr-0.1.5
- old
+ new
@@ -190,24 +190,28 @@
# <tt>text/xml</tt> content-type, a Nokogiri::XML::Document object
# will be returned. Other content-types will cause +nil+ to be
# returned.
#
def doc
- if html?
- return @doc ||= Nokogiri::HTML(body)
- elsif xml?
- return @doc ||= Nokogiri::XML(body)
+ begin
+ if html?
+ return @doc ||= Nokogiri::HTML(body)
+ elsif xml?
+ return @doc ||= Nokogiri::XML(body)
+ end
+ rescue
+ return nil
end
end
#
# Returns all links from the HTML page.
#
def links
urls = []
- if html?
+ if (html? && self.doc)
self.doc.search('a[@href]').each do |a|
url = a.get_attribute('href')
urls << url unless url.empty?
end
@@ -218,11 +222,11 @@
#
# Returns all links from the HTML page as absolute URLs.
#
def urls
- links.map { |link| to_absolute(link) }
+ links.map { |link| to_absolute(link) }.compact
end
protected
#
@@ -231,23 +235,27 @@
#
def to_absolute(link)
# clean the link
link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
- relative = URI(link)
- absolute = @url.merge(relative)
+ begin
+ relative = URI(link)
+ absolute = @url.merge(relative)
- if absolute.path
- if absolute.path.empty?
- # default the absolute path to '/'
- absolute.path = '/'
- else
- # make sure the path does not contain any .. or . directories.
- absolute.path = File.expand_path(absolute.path)
+ if absolute.path
+ if absolute.path.empty?
+ # default the absolute path to '/'
+ absolute.path = '/'
+ else
+ # make sure the path does not contain any .. or . directories.
+ absolute.path = File.expand_path(absolute.path)
+ end
end
- end
- return absolute
+ return absolute
+ rescue URI::InvalidURIError => e
+ return nil
+ end
end
#
# Provides transparent access to the values in the +headers+ +Hash+.
#