# encoding: utf-8 #-- # Copyright (c) 2013 Ryan Grove # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the 'Software'), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. #++ require 'rubygems' gem 'minitest' require 'minitest/autorun' require 'sanitize' strings = { :basic => { :html => 'Lorem ipsum dolor sit
amet ', :default => 'Lorem ipsum dolor sit amet alert("hello world");', :restricted => 'Lorem ipsum dolor sit amet alert("hello world");', :basic => 'Lorem ipsum dolor sit
amet alert("hello world");', :relaxed => 'Lorem ipsum dolor sit
amet alert("hello world");' }, :malformed => { :html => 'Lorem dolor sit
amet ', :default => 'Lorem ipsum dolor sit amet script>alert("hello world");', :restricted => 'Lorem ipsum dolor sit amet script>alert("hello world");', :basic => 'Lorem ipsum dolor sit
amet script>alert("hello world");', :relaxed => 'Lorem ipsum dolor sit
amet script>alert("hello world");' }, :raw_comment => { :html => 'Hello', :default => 'Hello', :restricted => 'Hello', :basic => 'Hello', :relaxed => 'Hello', :document => ' Hello ', } } tricky = { 'protocol-based JS injection: simple, no spaces' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: simple, spaces before' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: simple, spaces after' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: simple, spaces before and after' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: preceding colon' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: UTF-8 encoding' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: long UTF-8 encoding' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: long UTF-8 encoding without semicolons' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: hex encoding' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: long hex encoding' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: hex encoding without semicolons' => { :html => 'foo', :default => 'foo', :restricted => 'foo', :basic => 'foo', :relaxed => 'foo' }, 'protocol-based JS injection: null char' => { :html => "

", :default => '', :restricted => '', :basic => '', :relaxed => '

' # everything following the null char gets stripped, and URL is considered relative }, 'protocol-based JS injection: invalid URL char' => { :html => '

', :default => '', :restricted => '', :basic => '', :relaxed => '' }, 'protocol-based JS injection: spaces and entities' => { :html => '

', :default => '', :restricted => '', :basic => '', :relaxed => '' } } describe 'Config::DEFAULT' do it 'should translate valid HTML entities' do Sanitize.clean("Don't tasé me & bro!").must_equal("Don't tasé me & bro!") end it 'should translate valid HTML entities while encoding unencoded ampersands' do Sanitize.clean("cookies² & ¼ créme").must_equal("cookies² & ¼ créme") end it 'should never output '' do Sanitize.clean("IE6 isn't a real browser").wont_match(/'/) end it 'should not choke on several instances of the same element in a row' do Sanitize.clean('

').must_equal('') end it 'should surround the contents of :whitespace_elements with space characters when removing the element' do Sanitize.clean('foo

bar

baz').must_equal('foo bar baz') Sanitize.clean('foo
bar
baz').must_equal('foo bar baz') Sanitize.clean('foo

bar

baz').must_equal('foo bar baz') end strings.each do |name, data| it "should clean #{name} HTML" do Sanitize.clean(data[:html]).must_equal(data[:default]) end end tricky.each do |name, data| it "should not allow #{name}" do Sanitize.clean(data[:html]).must_equal(data[:default]) end end end describe 'Config::RESTRICTED' do before { @s = Sanitize.new(Sanitize::Config::RESTRICTED) } strings.each do |name, data| it "should clean #{name} HTML" do @s.clean(data[:html]).must_equal(data[:restricted]) end end tricky.each do |name, data| it "should not allow #{name}" do @s.clean(data[:html]).must_equal(data[:restricted]) end end end describe 'Config::BASIC' do before { @s = Sanitize.new(Sanitize::Config::BASIC) } it 'should not choke on valueless attributes' do @s.clean('foo foo bar').must_equal('foo foo bar') end it 'should downcase attribute names' do @s.clean('bar').must_equal('bar') end strings.each do |name, data| it "should clean #{name} HTML" do @s.clean(data[:html]).must_equal(data[:basic]) end end tricky.each do |name, data| it "should not allow #{name}" do @s.clean(data[:html]).must_equal(data[:basic]) end end end describe 'Config::RELAXED' do before { @s = Sanitize.new(Sanitize::Config::RELAXED) } it 'should encode special chars in attribute values' do input = 'foo' output = Nokogiri::HTML.fragment('foo').to_xhtml(:encoding => 'utf-8', :indent => 0, :save_with => Nokogiri::XML::Node::SaveOptions::AS_XHTML) @s.clean(input).must_equal(output) end strings.each do |name, data| it "should clean #{name} HTML" do @s.clean(data[:html]).must_equal(data[:relaxed]) end end tricky.each do |name, data| it "should not allow #{name}" do @s.clean(data[:html]).must_equal(data[:relaxed]) end end end describe 'Full Document parser (using clean_document)' do before { @s = Sanitize.new({:elements => %w[!DOCTYPE html]}) @default_doctype = "" } it 'should require HTML element is whitelisted to prevent parser errors' do assert_raises(RuntimeError, 'You must have the HTML element whitelisted') { Sanitize.clean_document!('', {:elements => [], :remove_contents => false}) } end it 'should NOT require HTML element to be whitelisted if remove_contents is true' do output = 'foo' Sanitize.clean_document!(output, {:remove_contents => true}).must_equal "\n\n" end it 'adds a doctype tag if not included' do @s.clean_document('').must_equal("#{@default_doctype}\n\n") end it 'should apply whitelist filtering to HTML element' do output = "\n\n\n" @s.clean_document(output).must_equal("\n\n") end strings.each do |name, data| it "should wrap #{name} with DOCTYPE and HTML tag" do output = data[:document] || data[:default] @s.clean_document(data[:html]).must_equal("#{@default_doctype}\n#{output}\n") end end tricky.each do |name, data| it "should wrap #{name} with DOCTYPE and HTML tag" do @s.clean_document(data[:html]).must_equal("#{@default_doctype}\n#{data[:default]}\n") end end end describe 'Custom configs' do it 'should allow attributes on all elements if whitelisted under :all' do input = '

bar

' Sanitize.clean(input).must_equal(' bar ') Sanitize.clean(input, {:elements => ['p'], :attributes => {:all => ['class']}}).must_equal(input) Sanitize.clean(input, {:elements => ['p'], :attributes => {'div' => ['class']}}).must_equal('

bar

') Sanitize.clean(input, {:elements => ['p'], :attributes => {'p' => ['title'], :all => ['class']}}).must_equal(input) end it 'should allow comments when :allow_comments == true' do input = 'foo baz' Sanitize.clean(input).must_equal('foo baz') Sanitize.clean(input, :allow_comments => true).must_equal(input) end it 'should allow relative URLs containing colons where the colon is not in the first path segment' do input = 'Random Page' Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input) end it 'should allow relative URLs containing colons where the colon is part of an anchor' do input = 'Footnote 1' Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input) end it 'should allow relative URLs containing colons where the colon is part of an anchor' do input = 'Footnote 1' Sanitize.clean(input, { :elements => ['a'], :attributes => {'a' => ['href']}, :protocols => { 'a' => { 'href' => [:relative] }} }).must_equal(input) end it 'should output HTML when :output == :html' do input = 'foo
bar
baz' Sanitize.clean(input, :elements => ['br'], :output => :html).must_equal('foo
bar
baz') end it 'should remove the contents of filtered nodes when :remove_contents == true' do Sanitize.clean('foo bar

bazquux

', :remove_contents => true).must_equal('foo bar ') end it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as strings' do Sanitize.clean('foo bar

bazquux

', :remove_contents => ['script', 'span']).must_equal('foo bar baz ') end it 'should remove the contents of specified nodes when :remove_contents is an Array of element names as symbols' do Sanitize.clean('foo bar

bazquux

', :remove_contents => [:script, :span]).must_equal('foo bar baz ') end it 'should support encodings other than utf-8' do html = 'foo bar' Sanitize.clean(html).must_equal("foo\302\240bar") Sanitize.clean(html, :output_encoding => 'ASCII').must_equal("foo bar") end it 'should not allow arbitrary HTML5 data attributes by default' do config = { :elements => ['b'] } Sanitize.clean('', config) .must_equal('') config[:attributes] = {'b' => ['class']} Sanitize.clean('', config) .must_equal('') end it 'should allow arbitrary HTML5 data attributes when the :attributes config includes :data' do config = { :attributes => {'b' => [:data]}, :elements => ['b'] } Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') Sanitize.clean('', config) .must_equal('') # Nokogiri quirk; not ideal, but harmless Sanitize.clean('', config) .must_equal('') # Another annoying Nokogiri quirk. end end describe 'Sanitize.clean' do it 'should not modify the input string' do input = 'foo' Sanitize.clean(input) input.must_equal('foo') end it 'should return a new string' do input = 'foo' Sanitize.clean(input).must_equal('foo') end end describe 'Sanitize.clean!' do it 'should modify the input string' do input = 'foo' Sanitize.clean!(input) input.must_equal('foo') end it 'should return the string if it was modified' do input = 'foo' Sanitize.clean!(input).must_equal('foo') end it 'should return nil if the string was not modified' do input = 'foo' Sanitize.clean!(input).must_equal(nil) end end describe 'Sanitize.clean_document' do before { @config = { :elements => ['html', 'p'] } } it 'should be idempotent' do input = '

foo

' first = Sanitize.clean_document(input, @config) second = Sanitize.clean_document(first, @config) second.must_equal first second.wont_be_nil end it 'should handle nil without raising' do Sanitize.clean_document(nil).must_equal nil end it 'should not modify the input string' do input = 'foo' Sanitize.clean_document(input, @config) input.must_equal('foo') end it 'should return a new string' do input = 'foo' Sanitize.clean_document(input, @config).must_equal("\nfoo\n") end end describe 'Sanitize.clean_document!' do before { @config = { :elements => ['html'] } } it 'should modify the input string' do input = 'foo' Sanitize.clean_document!(input, @config) input.must_equal("\nfoo\n") end it 'should return the string if it was modified' do input = 'foo' Sanitize.clean_document!(input, @config).must_equal("\nfoo\n") end it 'should return nil if the string was not modified' do input = "\n\n" Sanitize.clean_document!(input, @config).must_equal(nil) end end describe 'transformers' do # YouTube embed transformer. youtube = lambda do |env| node = env[:node] node_name = env[:node_name] # Don't continue if this node is already whitelisted or is not an element. return if env[:is_whitelisted] || !node.element? # Don't continue unless the node is an iframe. return unless node_name == 'iframe' # Verify that the video URL is actually a valid YouTube video URL. return unless node['src'] =~ /\Ahttps?:\/\/(?:www\.)?youtube(?:-nocookie)?\.com\// # We're now certain that this is a YouTube embed, but we still need to run # it through a special Sanitize step to ensure that no unwanted elements or # attributes that don't belong in a YouTube embed can sneak in. Sanitize.clean_node!(node, { :elements => %w[iframe], :attributes => { 'iframe' => %w[allowfullscreen frameborder height src width] } }) # Now that we're sure that this is a valid YouTube embed and that there are # no unwanted elements or attributes hidden inside it, we can tell Sanitize # to whitelist the current node. {:node_whitelist => [node]} end it 'should receive a complete env Hash as input' do Sanitize.clean!('foo', :foo => :bar, :transformers => lambda {|env| return unless env[:node].element? env[:config][:foo].must_equal(:bar) env[:is_whitelisted].must_equal(false) env[:node].must_be_kind_of(Nokogiri::XML::Node) env[:node_name].must_equal('span') env[:node_whitelist].must_be_kind_of(Set) env[:node_whitelist].must_be_empty }) end it 'should traverse all node types, including the fragment itself' do nodes = [] Sanitize.clean!('

foo

', :transformers => proc {|env| nodes << env[:node_name] }) nodes.must_equal(%w[ text div comment #cdata-section script #document-fragment ]) end it 'should traverse in depth-first mode by default' do nodes = [] Sanitize.clean!('

foo

bar

', :transformers => proc {|env| env[:traversal_mode].must_equal(:depth) nodes << env[:node_name] if env[:node].element? }) nodes.must_equal(['span', 'div', 'p']) end it 'should traverse in breadth-first mode when using :transformers_breadth' do nodes = [] Sanitize.clean!('

foo

bar

', :transformers_breadth => proc {|env| env[:traversal_mode].must_equal(:breadth) nodes << env[:node_name] if env[:node].element? }) nodes.must_equal(['div', 'span', 'p']) end it 'should whitelist nodes in the node whitelist' do Sanitize.clean!('

foo

bar', :transformers => [ proc {|env| {:node_whitelist => [env[:node]]} if env[:node_name] == 'div' }, proc {|env| env[:is_whitelisted].must_equal(false) unless env[:node_name] == 'div' env[:is_whitelisted].must_equal(true) if env[:node_name] == 'div' env[:node_whitelist].must_include(env[:node]) if env[:node_name] == 'div' } ]).must_equal('

foo

bar') end it 'should clear the node whitelist after each fragment' do called = false Sanitize.clean!('

foo

', :transformers => proc {|env| {:node_whitelist => [env[:node]]} }) Sanitize.clean!('

foo

', :transformers => proc {|env| called = true env[:is_whitelisted].must_equal(false) env[:node_whitelist].must_be_empty }) called.must_equal(true) end it 'should allow youtube video embeds via the youtube transformer' do input = '' output = Nokogiri::HTML::DocumentFragment.parse('').to_html(:encoding => 'utf-8', :indent => 0) Sanitize.clean!(input, :transformers => youtube).must_equal(output) end it 'should allow https youtube video embeds via the youtube transformer' do input = '' output = Nokogiri::HTML::DocumentFragment.parse('').to_html(:encoding => 'utf-8', :indent => 0) Sanitize.clean!(input, :transformers => youtube).must_equal(output) end it 'should allow privacy-enhanced youtube video embeds via the youtube transformer' do input = '' output = Nokogiri::HTML::DocumentFragment.parse('').to_html(:encoding => 'utf-8', :indent => 0) Sanitize.clean!(input, :transformers => youtube).must_equal(output) end it 'should not allow non-youtube video embeds via the youtube transformer' do input = '' output = '' Sanitize.clean!(input, :transformers => youtube).must_equal(output) end end describe 'bugs' do it 'should not have Nokogiri 1.4.2+ unterminated script/style element bug' do Sanitize.clean!('foo