#
# these tests taken from the HTML5 sanitization project and modified for use with Loofah
# see the original here: http://code.google.com/p/html5lib/source/browse/ruby/test/test_sanitizer.rb
#
# license text at the bottom of this file
#
require File.expand_path(File.join(File.dirname(__FILE__), '..', 'helper'))
require 'json'
class Html5TestSanitizer < Test::Unit::TestCase
include Loofah
# Parses +stream+ as a Loofah fragment, applies the :escape scrubber and
# returns whatever +to_xhtml+ yields for the scrubbed result.
def sanitize_xhtml(stream)
  scrubbed = Loofah.fragment(stream).scrub!(:escape)
  scrubbed.to_xhtml
end
# Parses +stream+ as a Loofah fragment, applies the :escape scrubber and
# returns whatever +to_html+ yields for the scrubbed result.
def sanitize_html(stream)
  scrubbed = Loofah.fragment(stream).scrub!(:escape)
  scrubbed.to_html
end
# Sanitizes +input+ with sanitize_html and asserts that the result equals one
# of the three accepted serializations.
#
# input       - markup handed to sanitize_html.
# htmloutput  - first accepted serialization.
# xhtmloutput - second accepted serialization.
# rexmloutput - third accepted serialization.
#
# On failure the assertion message lists the input, the sanitized result and
# every accepted output, so the mismatch can be read straight from the report.
def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
  ## libxml uses double-quotes while the expectations use single quotes, so
  ## swap the quotes before comparing.
  sane = sanitize_html(input).tr('"', "'")

  ## the serializers disagree about which elements get closing tags, and
  ## hand-tuning every expectation to libxml's output would be a lot of manual
  ## work. instead, accept a match against any of the described outputs.
  accepted = [htmloutput, rexmloutput, xhtmloutput]
  message = "input: #{input.inspect}\n" \
            "sanitized: #{sane.inspect}\n" \
            "accepted: #{accepted.inspect}"
  assert(accepted.include?(sane), message)
end
# Defines one test per entry of HTML5::WhiteList::ALLOWED_ELEMENTS. Each test
# builds a fixture around the element name, adjusts the expected strings for
# the names special-cased below, and hands everything to check_sanitization.
HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
  define_method("test_should_allow_#{tag_name}_tag") do
    # Names whose expected HTML/XHTML output is just the bare text.
    bare_text_tags = %w[caption colgroup optgroup option tbody td tfoot th thead tr]

    input = "<#{tag_name} title='1'>foo bar baz#{tag_name}>"
    expected_html = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz#{tag_name.downcase}>"
    expected_xhtml = expected_rexml = "<#{tag_name} title='1'>foo <bad>bar</bad> baz#{tag_name}>"

    # Branch order matters: the first matching rule wins, and names matching
    # none of them keep the defaults above.
    if bare_text_tags.include?(tag_name)
      expected_html = expected_xhtml = "foo <bad>bar</bad> baz"
    elsif tag_name == 'col'
      expected_html = expected_xhtml = "\nfoo <bad>bar</bad> baz"
      expected_rexml = ""
    elsif tag_name == 'table'
      expected_html = expected_xhtml = "foo <bad>bar</bad>baz"
    elsif tag_name == 'image'
      expected_html = expected_xhtml = "foo <bad>bar</bad> baz"
      expected_rexml = "foo <bad>bar</bad> baz"
    elsif HTML5::WhiteList::VOID_ELEMENTS.include?(tag_name)
      expected_html = expected_xhtml = "<#{tag_name} title='1'>foo <bad>bar</bad> baz"
      # += builds a new string, so only the HTML expectation gains the newline.
      expected_html += "\n" if tag_name == 'br'
      expected_rexml = "<#{tag_name} title='1' />"
    end

    check_sanitization(input, expected_html, expected_xhtml, expected_rexml)
  end
end
##
## libxml2 downcases elements, so this is moot.
##
# HTML5::WhiteList::ALLOWED_ELEMENTS.each do |tag_name|
# define_method "test_should_forbid_#{tag_name.upcase}_tag" do
# input = "<#{tag_name.upcase} title='1'>foo bar baz#{tag_name.upcase}>"
# output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>"
# check_sanitization(input, output, output, output)
# end
# end
# Defines one test per entry of HTML5::WhiteList::ALLOWED_ATTRIBUTES, except
# 'style', which is skipped. Each test passes a fixed fixture and its expected
# strings to check_sanitization.
HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
  next if attribute_name == 'style'

  define_method("test_should_allow_#{attribute_name}_attribute") do
    # Attribute names that select the first set of expectations below.
    first_branch_names = %w[checked compact disabled ismap multiple nohref noshade nowrap readonly selected]

    input = "foo bar baz\n"
    # NOTE(review): both branches currently build identical strings and
    # attribute_name is not interpolated into any fixture; confirm the
    # literals are what was intended.
    output, htmloutput =
      if first_branch_names.include?(attribute_name)
        ["foo <bad>bar</bad> baz\n", "foo <bad>bar</bad> baz\n"]
      else
        ["foo <bad>bar</bad> baz\n", "foo <bad>bar</bad> baz\n"]
      end

    check_sanitization(input, htmloutput, output, output)
  end
end
##
## libxml2 downcases attributes, so this is moot.
##
# HTML5::WhiteList::ALLOWED_ATTRIBUTES.each do |attribute_name|
# define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
# input = "foo bar baz\n"
# output = "foo <bad>bar</bad> baz\n"
# check_sanitization(input, output, output, output)
# end
# end
# Defines one test per entry of HTML5::WhiteList::ALLOWED_PROTOCOLS; each test
# expects the fixture to come back from check_sanitization unchanged.
HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
  define_method("test_should_allow_#{protocol}_uris") do
    fixture = "foo"
    expected = "foo"
    check_sanitization(fixture, expected, expected, expected)
  end
end
# Defines one "uppercase" test per entry of
# HTML5::WhiteList::ALLOWED_PROTOCOLS; each test expects the fixture to come
# back from check_sanitization unchanged.
HTML5::WhiteList::ALLOWED_PROTOCOLS.each do |protocol|
  define_method("test_should_allow_uppercase_#{protocol}_uris") do
    fixture = "foo"
    expected = "foo"
    check_sanitization(fixture, expected, expected, expected)
  end
end
# For every name in HTML5::WhiteList::SVG_ALLOW_LOCAL_HREF that is also in
# ALLOWED_ELEMENTS, defines four tests: an xlink:href of "#foo" (with and
# without a leading newline) is expected to be kept, while an xlink:href of
# "http://bad.com/foo" (with and without a leading newline) is expected to be
# dropped from the output.
HTML5::WhiteList::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
  next unless HTML5::WhiteList::ALLOWED_ELEMENTS.include?(tag_name)

  define_method("test_#{tag_name}_should_allow_local_href") do
    lowered = tag_name.downcase
    markup = %(<#{tag_name} xlink:href="#foo"/>)
    expected_html = "<#{lowered} xlink:href='#foo'>#{lowered}>"
    expected_xml = "<#{tag_name} xlink:href='#foo'>#{tag_name}>"
    check_sanitization(markup, expected_html, expected_xml, expected_xml)
  end

  define_method("test_#{tag_name}_should_allow_local_href_with_newline") do
    lowered = tag_name.downcase
    markup = %(<#{tag_name} xlink:href="\n#foo"/>)
    expected_html = "<#{lowered} xlink:href='\n#foo'>#{lowered}>"
    expected_xml = "<#{tag_name} xlink:href='\n#foo'>#{tag_name}>"
    check_sanitization(markup, expected_html, expected_xml, expected_xml)
  end

  define_method("test_#{tag_name}_should_forbid_nonlocal_href") do
    lowered = tag_name.downcase
    markup = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
    expected_html = "<#{lowered}>#{lowered}>"
    expected_xml = "<#{tag_name}>#{tag_name}>"
    check_sanitization(markup, expected_html, expected_xml, expected_xml)
  end

  define_method("test_#{tag_name}_should_forbid_nonlocal_href_with_newline") do
    lowered = tag_name.downcase
    markup = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
    expected_html = "<#{lowered}>#{lowered}>"
    expected_xml = "<#{tag_name}>#{tag_name}>"
    check_sanitization(markup, expected_html, expected_xml, expected_xml)
  end
end
##
## as tenderlove says, "care < 0"
##
# def test_should_handle_astral_plane_characters
# input = "𝒵 𝔸\n"
# output = "\360\235\222\265 \360\235\224\270\n"
# check_sanitization(input, output, output, output)
# input = "\360\235\224\270 a\n"
# output = "\360\235\224\270 a\n"
# check_sanitization(input, output, output, output)
# end
# This affects only NS4. Is it worth fixing?
# def test_javascript_includes
# input = %(foo\n)
# output = "foo\n"
# check_sanitization(input, output, output, output)
# end
##
## these tests primarily exercise the parser logic, not the sanitizer
## logic, so they are disabled. we're not writing a test suite for
## libxml2 here; instead we rely on the unit tests above to take care
## of our valid elements and attributes.
##
# Dir[File.join(File.dirname(__FILE__), 'testdata', '*.*')].each do |filename|
# JSON::parse(open(filename).read).each do |test|
# define_method "test_#{test['name']}" do
# check_sanitization(
# test['input'],
# test['output'],
# test['xhtml'] || test['output'],
# test['rexml'] || test['output']
# )
# end
# end
# end
## added because we don't have any coverage above on SVG_ATTR_VAL_ALLOWS_REF
# Defines four tests per entry of HTML5::WhiteList::SVG_ATTR_VAL_ALLOWS_REF.
# Every test passes its fixture and expected string to check_sanitization.
#
# The two "with_space" tests previously built their fixtures but never
# asserted anything, so they could not fail; they now run the same check as
# their siblings.
#
# NOTE(review): every fixture here is an empty string and attr_name is used
# only in the generated method name; confirm the fixtures are what was
# intended.
HTML5::WhiteList::SVG_ATTR_VAL_ALLOWS_REF.each do |attr_name|
  define_method "test_should_allow_uri_refs_in_svg_attribute_#{attr_name}" do
    input = ""
    output = ""
    check_sanitization(input, output, output, output)
  end

  define_method "test_absolute_uri_refs_in_svg_attribute_#{attr_name}" do
    input = ""
    output = ""
    check_sanitization(input, output, output, output)
  end

  define_method "test_uri_ref_with_space_in_svg_attribute_#{attr_name}" do
    input = ""
    rexml = ""
    check_sanitization(input, rexml, rexml, rexml)
  end

  define_method "test_absolute_uri_ref_with_space_in_svg_attribute_#{attr_name}" do
    input = ""
    rexml = ""
    check_sanitization(input, rexml, rexml, rexml)
  end
end
end
#
#
# Copyright (c) 2006-2008 The Authors
#
# Contributors:
# James Graham - jg307@cam.ac.uk
# Anne van Kesteren - annevankesteren@gmail.com
# Lachlan Hunt - lachlan.hunt@lachy.id.au
# Matt McDonald - kanashii@kanashii.ca
# Sam Ruby - rubys@intertwingly.net
# Ian Hickson (Google) - ian@hixie.ch
# Thomas Broyer - t.broyer@ltgt.net
# Jacques Distler - distler@golem.ph.utexas.edu
# Henri Sivonen - hsivonen@iki.fi
# The Mozilla Foundation (contributions from Henri Sivonen since 2008)
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#