lib/spidr/page.rb in spidr-0.2.0 vs lib/spidr/page.rb in spidr-0.2.1

- old
+ new

@@ -172,10 +172,20 @@ def xml? (content_type =~ /text\/xml/) == 0 end # + # Determines if the page is XML Stylesheet (XSL). + # + # @return [Boolean] + # Specifies whether the page is XML Stylesheet (XSL). + # + def xsl? + (content_type =~ /text\/xsl/) == 0 + end + + # # Determines if the page is JavaScript. # # @return [Boolean] # Specifies whether the page is JavaScript. # @@ -259,17 +269,20 @@ # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil] # The document that represents HTML or XML pages. # Returns +nil+ if the page is neither HTML, XML, RSS, Atom or if # the page could not be parsed properly. # + # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html + # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html + # def doc return nil if (body.nil? || body.empty?) begin if html? return @doc ||= Nokogiri::HTML(body) - elsif (xml? || rss? || atom?) + elsif (xml? || xsl? || rss? || atom?) return @doc ||= Nokogiri::XML(body) end rescue return nil end @@ -370,9 +383,17 @@ add_url.call(iframe.get_attribute('src')) end doc.search('iframe[@src]').each do |iframe| add_url.call(iframe.get_attribute('src')) + end + + doc.search('link[@href]').each do |link| + add_url.call(link.get_attribute('href')) + end + + doc.search('script[@src]').each do |script| + add_url.call(script.get_attribute('src')) end end return urls end