lib/spidr/page.rb in spidr-0.2.0 vs lib/spidr/page.rb in spidr-0.2.1
- old
+ new
@@ -172,10 +172,20 @@
def xml?
  # =~ gives the offset of the first match, or nil when nothing matches;
  # only an offset of zero counts, i.e. the type must begin with text/xml.
  match_offset = (content_type =~ /text\/xml/)
  match_offset == 0
end
#
+ # Determines if the page is an XML Stylesheet (XSL).
+ #
+ # @return [Boolean]
+ # Specifies whether the page is an XML Stylesheet (XSL).
+ #
+ def xsl? # predicate: takes no arguments and reads content_type
+ (content_type =~ /text\/xsl/) == 0 # =~ yields the first match offset (nil if none); true only when text/xsl is at offset 0
+ end
+
+ #
# Determines if the page is JavaScript.
#
# @return [Boolean]
# Specifies whether the page is JavaScript.
#
@@ -259,17 +269,20 @@
# @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
# The document that represents HTML or XML pages.
# Returns +nil+ if the page is not HTML, XML, XSL, RSS or Atom, or if
# the page could not be parsed properly.
#
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
+ # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
+ #
def doc
return nil if (body.nil? || body.empty?)
begin
if html?
return @doc ||= Nokogiri::HTML(body)
- elsif (xml? || rss? || atom?)
+ elsif (xml? || xsl? || rss? || atom?)
return @doc ||= Nokogiri::XML(body)
end
rescue
return nil
end
@@ -370,9 +383,17 @@
add_url.call(iframe.get_attribute('src'))
end
doc.search('iframe[@src]').each do |iframe|
add_url.call(iframe.get_attribute('src'))
+ end
+
+ doc.search('link[@href]').each do |link|
+ add_url.call(link.get_attribute('href'))
+ end
+
+ doc.search('script[@src]').each do |script|
+ add_url.call(script.get_attribute('src'))
end
end
return urls
end