test/test_sanitize.rb in sanitize-2.1.1 vs test/test_sanitize.rb in sanitize-3.0.0

- old
+ new

@@ -1,721 +1,93 @@ # encoding: utf-8 -#-- -# Copyright (c) 2013 Ryan Grove <ryan@wonko.com> -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the 'Software'), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -#++ +require_relative 'common' -require 'rubygems' -gem 'minitest' - -require 'minitest/autorun' -require 'sanitize' - -strings = { - :basic => { - :html => '<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>', - :default => 'Lorem ipsum dolor sit amet alert("hello world");', - :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet alert("hello world");', - :basic => '<b>Lorem</b> <a href="pants" rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet alert("hello world");', - :relaxed => '<b>Lorem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet alert("hello world");' - }, - - :malformed => { - :html => 'Lo<!-- comment -->rem</b> <a href=pants title="foo>ipsum <a href="http://foo.com/"><strong>dolor</a></strong> sit<br/>amet <script>alert("hello world");', - :default => 'Lorem dolor sit amet alert("hello world");', - :restricted => 'Lorem <strong>dolor</strong> sit amet alert("hello world");', - :basic => 'Lorem <a href="pants" rel="nofollow"><strong>dolor</strong></a> sit<br>amet alert("hello world");', - :relaxed => 'Lorem <a href="pants" title="foo&gt;ipsum &lt;a href="><strong>dolor</strong></a> sit<br>amet alert("hello world");', - :document => ' Lorem dolor sit amet alert("hello world"); ' - }, - - :unclosed => { - :html => '<p>a</p><blockquote>b', - :default => ' a b ', - :restricted => ' a b ', - :basic => '<p>a</p><blockquote>b</blockquote>', - :relaxed => '<p>a</p><blockquote>b</blockquote>' - }, - - :malicious => { - :html => '<b>Lo<!-- comment -->rem</b> <a href="javascript:pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <<foo>script>alert("hello world");</script>', - :default => 'Lorem ipsum dolor sit amet &lt;script&gt;alert("hello world");', - :restricted => '<b>Lorem</b> ipsum <strong>dolor</strong> sit amet &lt;script&gt;alert("hello world");', - :basic => '<b>Lorem</b> <a rel="nofollow">ipsum</a> <a href="http://foo.com/" rel="nofollow"><strong>dolor</strong></a> sit<br>amet &lt;script&gt;alert("hello world");', - :relaxed => '<b>Lorem</b> <a title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br>amet &lt;script&gt;alert("hello world");' - }, - - :raw_comment => { - :html => '<!-- comment -->Hello', - :default => 'Hello', - :restricted => 'Hello', - :basic => 'Hello', - :relaxed => 'Hello', - :document => ' Hello ', - } -} - -tricky = { - 'protocol-based JS injection: simple, no spaces' => { - :html => '<a href="javascript:alert(\'XSS\');">foo</a>', - :default => 'foo', - :restricted => 'foo', - :basic => '<a rel="nofollow">foo</a>', - :relaxed => '<a>foo</a>' - }, - - 'protocol-based JS injection: simple, spaces before' => { - :html => '<a href="javascript :alert(\'XSS\');">foo</a>', - :default => 'foo', - :restricted => 'foo', - :basic => '<a rel="nofollow">foo</a>', - :relaxed => '<a>foo</a>' - }, - - 'protocol-based JS injection: simple, spaces after' => { - :html => '<a href="javascript: alert(\'XSS\');">foo</a>', - :default => 'foo', - :restricted => 'foo', - :basic => '<a rel="nofollow">foo</a>', - :relaxed => '<a>foo</a>' - }, - - 'protocol-based JS injection: simple, spaces before and after' => { - :html => '<a href="javascript : alert(\'XSS\');">foo</a>', - :default => 'foo', - :restricted => 'foo', - :basic => '<a rel="nofollow">foo</a>', - :relaxed => '<a>foo</a>' - }, - - 'protocol-based JS injection: preceding colon' => { - :html => '<a href=":javascript:alert(\'XSS\');">foo</a>', - :default => 'foo', - :restricted => 'foo', - :basic => '<a rel="nofollow">foo</a>', - :relaxed => '<a>foo</a>' - }, - - 'protocol-based JS injection: UTF-8 encoding' => { - :html => '<a href="javascript&#58;">foo</a>', - :default => 'foo', - :restricted => 'foo', - :basic => '<a rel="nofollow">foo</a>', - :relaxed => '<a>foo</a>' - }, - - 'protocol-based JS injection: long UTF-8 encoding' => { - :html => '<a href="javascript&#0058;">foo</a>', - :default => 'foo', - :restricted => 'foo', - :basic => '<a rel="nofollow">foo</a>', - :relaxed => '<a>foo</a>' - }, - - 'protocol-based JS injection: long UTF-8 encoding without semicolons' => { - :html => '<a href=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>foo</a>', - :default => 'foo', - :restricted => 'foo', - :basic => '<a rel="nofollow">foo</a>', - :relaxed => '<a>foo</a>' - }, - - 'protocol-based JS injection: hex encoding' => { - :html => '<a href="javascript&#x3A;">foo</a>', - :default => 'foo', - :restricted => 'foo', - :basic => '<a rel="nofollow">foo</a>', - :relaxed => '<a>foo</a>' - }, - - 'protocol-based JS injection: long hex encoding' => { - :html => '<a href="javascript&#x003A;">foo</a>', - :default => 'foo', - :restricted => 'foo', - :basic => '<a rel="nofollow">foo</a>', - :relaxed => '<a>foo</a>' - }, - - 'protocol-based JS injection: hex encoding without semicolons' => { - :html => '<a href=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>foo</a>', - :default => 'foo', - :restricted => 'foo', - :basic => '<a rel="nofollow">foo</a>', - :relaxed => '<a>foo</a>' - }, - - 'protocol-based JS injection: null char' => { - :html => "<img src=java\0script:alert(\"XSS\")>", - :default => '', - :restricted => '', - :basic => '', - :relaxed => '<img src="java">' # everything following the null char gets stripped, and URL is considered relative - }, - - 'protocol-based JS injection: invalid URL char' => { - :html => '<img src=java\script:alert("XSS")>', - :default => '', - :restricted => '', - :basic => '', - :relaxed => '<img>' - }, - - 'protocol-based JS injection: spaces and entities' => { - :html => '<img src=" &#14; javascript:alert(\'XSS\');">', - :default => '', - :restricted => '', - :basic => '', - :relaxed => '<img src>' - } -} - -describe 'Config::DEFAULT' do - it 'should translate valid HTML entities' do - Sanitize.clean("Don&apos;t tas&eacute; me &amp; bro!").must_equal("Don't tasé me &amp; bro!") - end - - it 'should translate valid HTML entities while encoding unencoded ampersands' do - Sanitize.clean("cookies&sup2; & &frac14; cr&eacute;me").must_equal("cookies² &amp; ¼ créme") - end - - it 'should never output &apos;' do - Sanitize.clean("<a href='&apos;' class=\"' &#39;\">IE6 isn't a real browser</a>").wont_match(/&apos;/) - end - - it 'should not choke on several instances of the same element in a row' do - Sanitize.clean('<img src="http://www.google.com/intl/en_ALL/images/logo.gif"><img src="http://www.google.com/intl/en_ALL/images/logo.gif"><img src="http://www.google.com/intl/en_ALL/images/logo.gif"><img src="http://www.google.com/intl/en_ALL/images/logo.gif">').must_equal('') - end - - it 'should surround the contents of :whitespace_elements with space characters when removing the element' do - Sanitize.clean('foo<div>bar</div>baz').must_equal('foo bar baz') - Sanitize.clean('foo<br>bar<br>baz').must_equal('foo bar baz') - Sanitize.clean('foo<hr>bar<hr>baz').must_equal('foo bar baz') - end - - strings.each do |name, data| - it "should clean #{name} HTML" do - Sanitize.clean(data[:html]).must_equal(data[:default]) +describe 'Sanitize' do + describe 'instance methods' do + before do + @s = Sanitize.new end - end - tricky.each do |name, data| - it "should not allow #{name}" do - Sanitize.clean(data[:html]).must_equal(data[:default]) - end - end -end + describe '#document' do + before do + @s = Sanitize.new(:elements => ['html']) + end -describe 'Config::RESTRICTED' do - before { @s = Sanitize.new(Sanitize::Config::RESTRICTED) } + it 'should sanitize an HTML document' do + @s.document('<!doctype html><html><b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script></html>') + .must_equal "<html>Lorem ipsum dolor sit amet alert(\"hello world\");</html>\n" + end - strings.each do |name, data| - it "should clean #{name} HTML" do - @s.clean(data[:html]).must_equal(data[:restricted]) + it 'should not modify the input string' do + input = '<!DOCTYPE html><b>foo</b>' + @s.document(input) + input.must_equal('<!DOCTYPE html><b>foo</b>') + end end - end - tricky.each do |name, data| - it "should not allow #{name}" do - @s.clean(data[:html]).must_equal(data[:restricted]) - end - end -end + describe '#fragment' do + it 'should sanitize an HTML fragment' do + @s.fragment('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>') + .must_equal 'Lorem ipsum dolor sit amet alert("hello world");' + end -describe 'Config::BASIC' do - before { @s = Sanitize.new(Sanitize::Config::BASIC) } + it 'should not modify the input string' do + input = '<b>foo</b>' + @s.fragment(input) + input.must_equal '<b>foo</b>' + end - it 'should not choke on valueless attributes' do - @s.clean('foo <a href>foo</a> bar').must_equal('foo <a href rel="nofollow">foo</a> bar') - end - - it 'should downcase attribute names' do - @s.clean('<a HREF="javascript:alert(\'foo\')">bar</a>').must_equal('<a rel="nofollow">bar</a>') - end - - strings.each do |name, data| - it "should clean #{name} HTML" do - @s.clean(data[:html]).must_equal(data[:basic]) + it 'should not choke on fragments containing <html> or <body>' do + @s.fragment('<html><b>foo</b></html>').must_equal 'foo' + @s.fragment('<body><b>foo</b></body>').must_equal 'foo' + @s.fragment('<html><body><b>foo</b></body></html>').must_equal 'foo' + @s.fragment('<!DOCTYPE html><html><body><b>foo</b></body></html>').must_equal 'foo' + end end - end - tricky.each do |name, data| - it "should not allow #{name}" do - @s.clean(data[:html]).must_equal(data[:basic]) - end - end -end + describe '#node!' do + it 'should sanitize a Nokogiri::XML::Node' do + doc = Nokogiri::HTML5.parse('<b>Lo<!-- comment -->rem</b> <a href="pants" title="foo">ipsum</a> <a href="http://foo.com/"><strong>dolor</strong></a> sit<br/>amet <script>alert("hello world");</script>') + frag = doc.fragment -describe 'Config::RELAXED' do - before { @s = Sanitize.new(Sanitize::Config::RELAXED) } + doc.xpath('/html/body/node()').each {|node| frag << node } - it 'should encode special chars in attribute values' do - input = '<a href="http://example.com" title="<b>&eacute;xamples</b> & things">foo</a>' - output = Nokogiri::HTML.fragment('<a href="http://example.com" title="&lt;b&gt;éxamples&lt;/b&gt; &amp; things">foo</a>').to_xhtml(:encoding => 'utf-8', :indent => 0, :save_with => Nokogiri::XML::Node::SaveOptions::AS_XHTML) - @s.clean(input).must_equal(output) - end + @s.node!(frag) + frag.to_html.must_equal 'Lorem ipsum dolor sit amet alert("hello world");' + end - strings.each do |name, data| - it "should clean #{name} HTML" do - @s.clean(data[:html]).must_equal(data[:relaxed]) + describe "when the given node is a document and <html> isn't whitelisted" do + it 'should raise a Sanitize::Error' do + doc = Nokogiri::HTML5.parse('foo') + proc { @s.node!(doc) }.must_raise Sanitize::Error + end + end end end - tricky.each do |name, data| - it "should not allow #{name}" do - @s.clean(data[:html]).must_equal(data[:relaxed]) + describe 'class methods' do + describe '.document' do + it 'should call #document' do + Sanitize.stub_instance(:document, proc {|html| html + ' called' }) do + Sanitize.document('<html>foo</html>') + .must_equal '<html>foo</html> called' + end + end end - end -end -describe 'Full Document parser (using clean_document)' do - before { - @s = Sanitize.new({:elements => %w[!DOCTYPE html]}) - @default_doctype = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">" - } - - it 'should require HTML element is whitelisted to prevent parser errors' do - assert_raises(RuntimeError, 'You must have the HTML element whitelisted') { - Sanitize.clean_document!('', {:elements => [], :remove_contents => false}) - } - end - - it 'should NOT require HTML element to be whitelisted if remove_contents is true' do - output = '<!DOCTYPE html><html>foo</html>' - Sanitize.clean_document!(output, {:remove_contents => true}).must_equal "<!DOCTYPE html>\n\n" - end - - it 'adds a doctype tag if not included' do - @s.clean_document('').must_equal("#{@default_doctype}\n\n") - end - - it 'should apply whitelist filtering to HTML element' do - output = "<!DOCTYPE html>\n<html anything='false'></html>\n\n" - @s.clean_document(output).must_equal("<!DOCTYPE html>\n<html></html>\n") - end - - strings.each do |name, data| - it "should wrap #{name} with DOCTYPE and HTML tag" do - output = data[:document] || data[:default] - @s.clean_document(data[:html]).must_equal("#{@default_doctype}\n<html>#{output}</html>\n") - end - end - - tricky.each do |name, data| - it "should wrap #{name} with DOCTYPE and HTML tag" do - @s.clean_document(data[:html]).must_equal("#{@default_doctype}\n<html>#{data[:default]}</html>\n") - end - end -end - -describe 'Custom configs' do - it 'should allow attributes on all elements if whitelisted under :all' do - input = '<p class="foo">bar</p>' - - Sanitize.clean(input).must_equal(' bar ') - Sanitize.clean(input, {:elements => ['p'], :attributes => {:all => ['class']}}).must_equal(input) - Sanitize.clean(input, {:elements => ['p'], :attributes => {'div' => ['class']}}).must_equal('<p>bar</p>') - Sanitize.clean(input, {:elements => ['p'], :attributes => {'p' => ['title'], :all => ['class']}}).must_equal(input) - end - - it 'should allow comments when :allow_comments == true' do - input = 'foo <!-- bar --> baz' - Sanitize.clean(input).must_equal('foo baz') - Sanitize.clean(input, :allow_comments => true).must_equal(input) - end - - it 'should allow relative URLs containing colons where the colon is not in the first path segment' do - input = '<a href="/wiki/Special:Random">Random Page</a>' - Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input) - end - - it 'should allow relative URLs containing colons where the colon is part of an anchor' do - input = '<a href="#fn:1">Footnote 1</a>' - Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input) - end - - it 'should allow relative URLs containing colons where the colon is part of an anchor' do - input = '<a href="somepage#fn:1">Footnote 1</a>' - Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input) - end - - it 'should output HTML when :output == :html' do - input = 'foo<br/>bar<br>baz' - Sanitize.clean(input, :elements => ['br'], :output => :html).must_equal('foo<br>bar<br>baz') - end - - it 'should remove the contents of filtered nodes when :remove_contents == true' do - Sanitize.clean('foo bar <div>baz<span>quux</span></div>', :remove_contents => true).must_equal('foo bar ') - end - - it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as strings' do - Sanitize.clean('foo bar <div>baz<span>quux</span><script>alert("hello!");</script></div>', :remove_contents => ['script', 'span']).must_equal('foo bar baz ') - end - - it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as symbols' do - Sanitize.clean('foo bar <div>baz<span>quux</span><script>alert("hello!");</script></div>', :remove_contents => [:script, :span]).must_equal('foo bar baz ') - end - - it 'should support encodings other than utf-8' do - html = 'foo&nbsp;bar' - Sanitize.clean(html).must_equal("foo\302\240bar") - Sanitize.clean(html, :output_encoding => 'ASCII').must_equal("foo&#160;bar") - end - - it 'should not allow arbitrary HTML5 data attributes by default' do - config = { - :elements => ['b'] - } - - Sanitize.clean('<b data-foo="bar"></b>', config) - .must_equal('<b></b>') - - config[:attributes] = {'b' => ['class']} - - Sanitize.clean('<b class="foo" data-foo="bar"></b>', config) - .must_equal('<b class="foo"></b>') - end - - it 'should allow arbitrary HTML5 data attributes when the :attributes config includes :data' do - config = { - :attributes => {'b' => [:data]}, - :elements => ['b'] - } - - Sanitize.clean('<b data-foo="valid" data-bar="valid"></b>', config) - .must_equal('<b data-foo="valid" data-bar="valid"></b>') - - Sanitize.clean('<b data-="invalid"></b>', config) - .must_equal('<b></b>') - - Sanitize.clean('<b data-="invalid"></b>', config) - .must_equal('<b></b>') - - Sanitize.clean('<b data-xml="invalid"></b>', config) - .must_equal('<b></b>') - - Sanitize.clean('<b data-xmlfoo="invalid"></b>', config) - .must_equal('<b></b>') - - Sanitize.clean('<b data-f:oo="valid"></b>', config) - .must_equal('<b></b>') - - Sanitize.clean('<b data-f/oo="partial"></b>', config) - .must_equal('<b data-f></b>') # Nokogiri quirk; not ideal, but harmless - - Sanitize.clean('<b data-éfoo="valid"></b>', config) - .must_equal('<b></b>') # Another annoying Nokogiri quirk. - end -end - -describe 'Sanitize.clean' do - it 'should not modify the input string' do - input = '<b>foo</b>' - Sanitize.clean(input) - input.must_equal('<b>foo</b>') - end - - it 'should return a new string' do - input = '<b>foo</b>' - Sanitize.clean(input).must_equal('foo') - end -end - -describe 'Sanitize.clean!' do - it 'should modify the input string' do - input = '<b>foo</b>' - Sanitize.clean!(input) - input.must_equal('foo') - end - - it 'should return the string if it was modified' do - input = '<b>foo</b>' - Sanitize.clean!(input).must_equal('foo') - end - - it 'should return nil if the string was not modified' do - input = 'foo' - Sanitize.clean!(input).must_equal(nil) - end -end - -describe 'Sanitize.clean_document' do - before { @config = { :elements => ['html', 'p'] } } - - it 'should be idempotent' do - input = '<!DOCTYPE html><html><p>foo</p></html>' - first = Sanitize.clean_document(input, @config) - second = Sanitize.clean_document(first, @config) - second.must_equal first - second.wont_be_nil - end - - it 'should handle nil without raising' do - Sanitize.clean_document(nil).must_equal nil - end - - it 'should not modify the input string' do - input = '<!DOCTYPE html><b>foo</b>' - Sanitize.clean_document(input, @config) - input.must_equal('<!DOCTYPE html><b>foo</b>') - end - - it 'should return a new string' do - input = '<!DOCTYPE html><b>foo</b>' - Sanitize.clean_document(input, @config).must_equal("<!DOCTYPE html>\n<html>foo</html>\n") - end -end - -describe 'Sanitize.clean_document!' do - before { @config = { :elements => ['html'] } } - - it 'should modify the input string' do - input = '<!DOCTYPE html><html><body><b>foo</b></body></html>' - Sanitize.clean_document!(input, @config) - input.must_equal("<!DOCTYPE html>\n<html>foo</html>\n") - end - - it 'should return the string if it was modified' do - input = '<!DOCTYPE html><html><body><b>foo</b></body></html>' - Sanitize.clean_document!(input, @config).must_equal("<!DOCTYPE html>\n<html>foo</html>\n") - end - - it 'should return nil if the string was not modified' do - input = "<!DOCTYPE html>\n<html></html>\n" - Sanitize.clean_document!(input, @config).must_equal(nil) - end -end - -describe 'transformers' do - # YouTube embed transformer. - youtube = lambda do |env| - node = env[:node] - node_name = env[:node_name] - - # Don't continue if this node is already whitelisted or is not an element. - return if env[:is_whitelisted] || !node.element? - - # Don't continue unless the node is an iframe. - return unless node_name == 'iframe' - - # Verify that the video URL is actually a valid YouTube video URL. - return unless node['src'] =~ /\Ahttps?:\/\/(?:www\.)?youtube(?:-nocookie)?\.com\// - - # We're now certain that this is a YouTube embed, but we still need to run - # it through a special Sanitize step to ensure that no unwanted elements or - # attributes that don't belong in a YouTube embed can sneak in. - Sanitize.clean_node!(node, { - :elements => %w[iframe], - - :attributes => { - 'iframe' => %w[allowfullscreen frameborder height src width] - } - }) - - # Now that we're sure that this is a valid YouTube embed and that there are - # no unwanted elements or attributes hidden inside it, we can tell Sanitize - # to whitelist the current node. - {:node_whitelist => [node]} - end - - it 'should receive a complete env Hash as input' do - Sanitize.clean!('<SPAN>foo</SPAN>', :foo => :bar, :transformers => lambda {|env| - return unless env[:node].element? - - env[:config][:foo].must_equal(:bar) - env[:is_whitelisted].must_equal(false) - env[:node].must_be_kind_of(Nokogiri::XML::Node) - env[:node_name].must_equal('span') - env[:node_whitelist].must_be_kind_of(Set) - env[:node_whitelist].must_be_empty - }) - end - - it 'should traverse all node types, including the fragment itself' do - nodes = [] - - Sanitize.clean!('<div>foo</div><!--bar--><script>cdata!</script>', :transformers => proc {|env| - nodes << env[:node_name] - }) - - nodes.must_equal(%w[ - text div comment #cdata-section script #document-fragment - ]) - end - - it 'should traverse in depth-first mode by default' do - nodes = [] - - Sanitize.clean!('<div><span>foo</span></div><p>bar</p>', :transformers => proc {|env| - env[:traversal_mode].must_equal(:depth) - nodes << env[:node_name] if env[:node].element? - }) - - nodes.must_equal(['span', 'div', 'p']) - end - - it 'should traverse in breadth-first mode when using :transformers_breadth' do - nodes = [] - - Sanitize.clean!('<div><span>foo</span></div><p>bar</p>', :transformers_breadth => proc {|env| - env[:traversal_mode].must_equal(:breadth) - nodes << env[:node_name] if env[:node].element? - }) - - nodes.must_equal(['div', 'span', 'p']) - end - - it 'should whitelist nodes in the node whitelist' do - Sanitize.clean!('<div class="foo">foo</div><span>bar</span>', :transformers => [ - proc {|env| - {:node_whitelist => [env[:node]]} if env[:node_name] == 'div' - }, - - proc {|env| - env[:is_whitelisted].must_equal(false) unless env[:node_name] == 'div' - env[:is_whitelisted].must_equal(true) if env[:node_name] == 'div' - env[:node_whitelist].must_include(env[:node]) if env[:node_name] == 'div' - } - ]).must_equal('<div class="foo">foo</div>bar') - end - - it 'should clear the node whitelist after each fragment' do - called = false - - Sanitize.clean!('<div>foo</div>', :transformers => proc {|env| - {:node_whitelist => [env[:node]]} - }) - - Sanitize.clean!('<div>foo</div>', :transformers => proc {|env| - called = true - env[:is_whitelisted].must_equal(false) - env[:node_whitelist].must_be_empty - }) - - called.must_equal(true) - end - - it 'should allow youtube video embeds via the youtube transformer' do - input = '<iframe width="420" height="315" src="http://www.youtube.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen bogus="bogus"><script>alert()</script></iframe>' - output = Nokogiri::HTML::DocumentFragment.parse('<iframe width="420" height="315" src="http://www.youtube.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen>alert()</iframe>').to_html(:encoding => 'utf-8', :indent => 0) - - Sanitize.clean!(input, :transformers => youtube).must_equal(output) - end - - it 'should allow https youtube video embeds via the youtube transformer' do - input = '<iframe width="420" height="315" src="https://www.youtube.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen bogus="bogus"><script>alert()</script></iframe>' - output = Nokogiri::HTML::DocumentFragment.parse('<iframe width="420" height="315" src="https://www.youtube.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen>alert()</iframe>').to_html(:encoding => 'utf-8', :indent => 0) - - Sanitize.clean!(input, :transformers => youtube).must_equal(output) - end - - it 'should allow privacy-enhanced youtube video embeds via the youtube transformer' do - input = '<iframe width="420" height="315" src="http://www.youtube-nocookie.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen bogus="bogus"><script>alert()</script></iframe>' - output = Nokogiri::HTML::DocumentFragment.parse('<iframe width="420" height="315" src="http://www.youtube-nocookie.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen>alert()</iframe>').to_html(:encoding => 'utf-8', :indent => 0) - - Sanitize.clean!(input, :transformers => youtube).must_equal(output) - end - - it 'should not allow non-youtube video embeds via the youtube transformer' do - input = '<iframe width="420" height="315" src="http://www.fake-youtube.com/embed/QH2-TGUlwu4" frameborder="0" allowfullscreen></iframe>' - output = '' - - Sanitize.clean!(input, :transformers => youtube).must_equal(output) - end -end - -describe 'bugs' do - it 'should not have Nokogiri 1.4.2+ unterminated script/style element bug' do - Sanitize.clean!('foo <script>bar').must_equal('foo bar') - Sanitize.clean!('foo <style>bar').must_equal('foo bar') - end -end - -describe 'Malicious HTML' do - make_my_diffs_pretty! - parallelize_me! - - before do - @s = Sanitize.new(Sanitize::Config::RELAXED) - end - - # libxml2 >= 2.9.2 doesn't escape comments within some attributes, in an - # attempt to preserve server-side includes. This can result in XSS since an - # unescaped double quote can allow an attacker to inject a non-whitelisted - # attribute. Sanitize works around this by implementing its own escaping for - # affected attributes. - # - # The relevant libxml2 code is here: - # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588> - describe 'unsafe libxml2 server-side includes in attributes' do - tag_configs = [ - { - tag_name: 'a', - escaped_attrs: %w[ action href src name ], - unescaped_attrs: [] - }, - - { - tag_name: 'div', - escaped_attrs: %w[ action href src ], - unescaped_attrs: %w[ name ] - } - ] - - before do - @s = Sanitize.new({ - elements: %w[ a div ], - - attributes: { - all: %w[ action href src name ] - } - }) - end - - tag_configs.each do |tag_config| - tag_name = tag_config[:tag_name] - - tag_config[:escaped_attrs].each do |attr_name| - input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>] - - it 'should escape unsafe characters in attributes' do - @s.clean(input).must_equal(%[<#{tag_name} #{attr_name}="examp<!--%22%20onmouseover=alert(1)>-->le.com">foo</#{tag_name}>]) + describe '.fragment' do + it 'should call #fragment' do + Sanitize.stub_instance(:fragment, proc {|html| html + ' called' }) do + Sanitize.fragment('<b>foo</b>').must_equal '<b>foo</b> called' end - - it 'should round-trip to the same output' do - output = @s.clean(input) - @s.clean(output).must_equal(output) - end end + end - tag_config[:unescaped_attrs].each do |attr_name| - input = %[<#{tag_name} #{attr_name}='examp<!--" onmouseover=alert(1)>-->le.com'>foo</#{tag_name}>] - - it 'should not escape characters unnecessarily' do - @s.clean(input).must_equal(input) + describe '.node!' do + it 'should call #node!' do + Sanitize.stub_instance(:node!, proc {|input| input + ' called' }) do + Sanitize.node!('not really a node').must_equal 'not really a node called' end - - it 'should round-trip to the same output' do - output = @s.clean(input) - @s.clean(output).must_equal(output) - end end end end end -