# frozen_string_literal: true
require "minitest/autorun"
require "rails-html-sanitizer"
# Print parser details before the tests run. Presumably this is so a failure can be matched to
# the parser variants discussed in the note below — confirm intent with the maintainers.
puts "nokogiri version info: #{Nokogiri::VERSION_INFO}"
puts "html5 support: #{Rails::HTML::Sanitizer.html5_support?}"
#
# NOTE that many of these tests contain multiple acceptable results.
#
# In some cases, this is because of how the HTML4 parser's recovery behavior changed in libxml2
# 2.9.14 and 2.10.0. For more details, see:
#
# - https://github.com/sparklemotion/nokogiri/releases/tag/v1.13.5
# - https://gitlab.gnome.org/GNOME/libxml2/-/issues/380
#
# In other cases, multiple acceptable results are provided because Nokogiri's vendored libxml2 is
# patched to entity-escape server-side includes (aka "SSI", e.g. `<!--#include virtual="..." -->`).
#
# In many other cases, it's because the parser used by Nokogiri on JRuby (xerces+nekohtml) parses
# slightly differently than libxml2 in edge cases.
#
module SanitizerTests
# Reports whether the loaded Loofah exposes HTML5 support.
#
# Returns false when Loofah does not respond to `html5_support?`; otherwise
# returns whatever `Loofah.html5_support?` returns.
def self.loofah_html5_support?
  return false unless Loofah.respond_to?(:html5_support?)

  Loofah.html5_support?
end
# Tests for behavior inherited from the Rails::HTML::Sanitizer base class: the
# default #sanitize (expected to raise NotImplementedError) and remove_xpaths.
#
# NOTE(review): several HTML fixture literals in this class appear to have lost
# their markup in this copy (inputs and expectations are empty or identical) —
# confirm against the canonical file before relying on them.
class BaseSanitizerTest < Minitest::Test
# Minimal subclass used to reach remove_xpaths from a test: parses the input
# with Loofah.fragment, hands the fragment to remove_xpaths together with
# options[:xpaths], and returns the result converted with #to_s.
class XpathRemovalTestSanitizer < Rails::HTML::Sanitizer
def sanitize(html, options = {})
fragment = Loofah.fragment(html)
remove_xpaths(fragment, options[:xpaths]).to_s
end
end
# Calling #sanitize directly on the base class must raise NotImplementedError.
def test_sanitizer_sanitize_raises_not_implemented_error
assert_raises NotImplementedError do
Rails::HTML::Sanitizer.new.sanitize("asdf")
end
end
# Sanitizes a fixture with xpaths: [".//script"] and compares with a fixed string.
def test_remove_xpaths_removes_an_xpath
html = %(
hello
)
assert_equal %(hello
), xpath_sanitize(html, xpaths: %w(.//script))
end
# Same XPath as above; per the test name this covers multiple matching nodes.
# NOTE(review): both literals are empty in this copy — confirm the fixture.
def test_remove_xpaths_removes_all_occurrences_of_xpath
html = %()
assert_equal %(), xpath_sanitize(html, xpaths: %w(.//script))
end
# A malformed XPath expression is expected to raise Nokogiri::XML::XPath::SyntaxError.
def test_remove_xpaths_called_with_faulty_xpath
assert_raises Nokogiri::XML::XPath::SyntaxError do
xpath_sanitize("hello", xpaths: %w(..faulty_xpath))
end
end
# xpaths: is given as a single String rather than an array.
def test_remove_xpaths_called_with_xpath_string
assert_equal "", xpath_sanitize("", xpaths: ".//a")
end
# xpaths: is given as an array of two expressions.
def test_remove_xpaths_called_with_enumerable_xpaths
assert_equal "", xpath_sanitize("", xpaths: %w(.//a .//span))
end
protected
# Runs the input through a fresh XpathRemovalTestSanitizer with the given options.
def xpath_sanitize(input, options = {})
XpathRemovalTestSanitizer.new.sanitize(input, options)
end
end
# Mixin giving test instances access to the module their test class targets.
module ModuleUnderTest
  # Returns the value of the @module_under_test instance variable stored on the
  # receiver's class (nil when the class never set it).
  def module_under_test
    host_class = self.class
    host_class.instance_variable_get(:@module_under_test)
  end
end
# Shared FullSanitizer test cases. The including test class supplies the module
# to test through its @module_under_test class-level instance variable, which
# ModuleUnderTest#module_under_test reads.
#
# NOTE(review): many HTML fixture literals in this module appear to have lost
# their markup in this copy (inputs equal expectations, or are empty) — confirm
# against the canonical file before relying on them.
module FullSanitizerTest
include ModuleUnderTest
# Accepts either of two results, depending on the parser noted beside each one.
def test_strip_tags_with_quote
input = '<"
hi'
result = full_sanitize(input)
acceptable_results = [
# libxml2 >= 2.9.14 and xerces+neko
%{<" hi},
# other libxml2
%{ hi},
]
assert_includes(acceptable_results, result)
end
# NOTE(review): the body of this test looks truncated/merged in this copy: the
# string literal opened on the assert_equal line is never closed, and
# `expected`/`input` are used without a visible assignment. Restore it from the
# canonical file.
def test_strip_invalid_html
assert_equal "<<", full_sanitize("<<This is a test.
\n\n\n\n
It no longer contains any HTML.
\n}
assert_equal expected, full_sanitize(input)
end
# Accepts either of two results, depending on the parser noted beside each one.
def test_remove_unclosed_tags
input = "This is <-- not\n a comment here."
result = full_sanitize(input)
acceptable_results = [
# libxml2 >= 2.9.14 and xerces+neko
%{This is <-- not\n a comment here.},
# other libxml2
%{This is },
]
assert_includes(acceptable_results, result)
end
# Accepts any of three results, depending on the parser noted beside each one.
def test_strip_cdata
input = "This has a ]]> here."
result = full_sanitize(input)
acceptable_results = [
# libxml2 = 2.9.14
%{This has a <![CDATA[]]> here.},
# other libxml2
%{This has a ]]> here.},
# xerces+neko
%{This has a here.},
]
assert_includes(acceptable_results, result)
end
# nil is expected back as nil; "" and " " are expected back unchanged.
def test_strip_blank_string
assert_nil full_sanitize(nil)
assert_equal "", full_sanitize("")
assert_equal " ", full_sanitize(" ")
end
# Text without markup is expected back unchanged.
def test_strip_tags_with_plaintext
assert_equal "Don't touch me", full_sanitize("Don't touch me")
end
def test_strip_tags_with_tags
assert_equal "This is a test.", full_sanitize("This is a test.
")
end
def test_escape_tags_with_many_open_quotes
assert_equal "<<", full_sanitize("<<")
end
def test_strip_tags_with_sentence
assert_equal "This is a test.", full_sanitize("This is a test.")
end
def test_strip_tags_with_comment
assert_equal "This has a here.", full_sanitize("This has a here.")
end
def test_strip_tags_with_frozen_string
assert_equal "Frozen string with no tags", full_sanitize("Frozen string with no tags")
end
# The first assertion uses single-quoted literals, so \r\n are literal
# backslash sequences rather than control characters.
# NOTE(review): the three identical "&" assertions presumably differed before
# entity text was altered in this copy — confirm.
def test_full_sanitize_respect_html_escaping_of_the_given_string
assert_equal 'test\r\nstring', full_sanitize('test\r\nstring')
assert_equal "&", full_sanitize("&")
assert_equal "&", full_sanitize("&")
assert_equal "&", full_sanitize("&")
assert_equal "omg <script>BOM</script>", full_sanitize("omg <script>BOM</script>")
end
# The input is transcoded to ASCII-8BIT first; the result is expected to be
# tagged as UTF-8.
def test_sanitize_ascii_8bit_string
full_sanitize("".encode("ASCII-8BIT")).tap do |sanitized|
assert_equal "hello", sanitized
assert_equal Encoding::UTF_8, sanitized.encoding
end
end
protected
# Sanitizes input with a fresh FullSanitizer from the module under test.
def full_sanitize(input, options = {})
module_under_test::FullSanitizer.new.sanitize(input, options)
end
end
# Runs the shared FullSanitizerTest cases against Rails::HTML4.
class HTML4FullSanitizerTest < Minitest::Test
  include FullSanitizerTest

  # Read back at test time through ModuleUnderTest#module_under_test.
  @module_under_test = Rails::HTML4
end
# Runs the shared FullSanitizerTest cases against Rails::HTML5; the class is
# only defined when loofah_html5_support? is truthy.
if loofah_html5_support?
  class HTML5FullSanitizerTest < Minitest::Test
    include FullSanitizerTest

    # Read back at test time through ModuleUnderTest#module_under_test.
    @module_under_test = Rails::HTML5
  end
end
# Shared LinkSanitizer test cases. The including test class supplies the module
# to test through its @module_under_test class-level instance variable, which
# ModuleUnderTest#module_under_test reads.
#
# NOTE(review): most HTML fixture literals in this module appear to have lost
# their markup in this copy (inputs equal expectations) — confirm against the
# canonical file before relying on them.
module LinkSanitizerTest
include ModuleUnderTest
def test_strip_links_with_tags_in_tags
expected = "<a href='hello'>all day long</a>"
input = "<a href='hello'>all day long</a>"
assert_equal expected, link_sanitize(input)
end
def test_strip_links_with_unclosed_tags
assert_equal "", link_sanitize("on my mind\nall day long")
end
def test_strip_links_leaves_nonlink_tags
assert_equal "My mind\nall day long", link_sanitize("My mind\nall day long")
end
def test_strip_links_with_links
assert_equal "0wn3d", link_sanitize("0wn3d")
end
def test_strip_links_with_linkception
assert_equal "Magic", link_sanitize("Magic")
end
# The input is transcoded to ASCII-8BIT first; the result is expected to be
# tagged as UTF-8.
def test_sanitize_ascii_8bit_string
link_sanitize("".encode("ASCII-8BIT")).tap do |sanitized|
assert_equal "hello
", sanitized
assert_equal Encoding::UTF_8, sanitized.encoding
end
end
protected
# Sanitizes input with a fresh LinkSanitizer from the module under test.
def link_sanitize(input, options = {})
module_under_test::LinkSanitizer.new.sanitize(input, options)
end
end
# Runs the shared LinkSanitizerTest cases against Rails::HTML4.
class HTML4LinkSanitizerTest < Minitest::Test
  include LinkSanitizerTest

  # Read back at test time through ModuleUnderTest#module_under_test.
  @module_under_test = Rails::HTML4
end
# Runs the shared LinkSanitizerTest cases against Rails::HTML5; the class is
# only defined when loofah_html5_support? is truthy.
if loofah_html5_support?
  class HTML5LinkSanitizerTest < Minitest::Test
    include LinkSanitizerTest

    # Read back at test time through ModuleUnderTest#module_under_test.
    @module_under_test = Rails::HTML5
  end
end
module SafeListSanitizerTest
include ModuleUnderTest
# Sanitizes a script-injection payload with only `em` allowed (tags: %w(em))
# and compares the result with a fixed string.
# NOTE(review): the input literal appears to have lost its tags in this copy — confirm.
def test_sanitize_nested_script
assert_equal '<script>alert("XSS");</script>', safe_list_sanitize('alert("XSS");/', tags: %w(em))
end
# Sanitizes a script-injection payload with only `em` allowed and accepts
# either of the parser-specific results listed below.
# NOTE(review): the input literal appears to have lost its tags in this copy — confirm.
def test_sanitize_nested_script_in_style
input = 'alert("XSS");/'
result = safe_list_sanitize(input, tags: %w(em))
acceptable_results = [
# libxml2
%{<script>alert("XSS");</script>},
# xerces+neko. unavoidable double-escaping, see loofah/docs/2022-10-decision-on-cdata-nodes.md
%{<script>alert(\"XSS\");<</style>/script>},
]
assert_includes(acceptable_results, result)
end
# Sanitizes text containing an unterminated CDATA-style sequence and accepts
# any of the parser-specific results listed below.
# NOTE(review): the input literal may have lost a CDATA opener in this copy — confirm.
def test_strip_unclosed_cdata
input = "This has an unclosed ]] here..."
result = safe_list_sanitize(input)
acceptable_results = [
# libxml2 = 2.9.14
%{This has an unclosed <![CDATA[]] here...},
# other libxml2
%{This has an unclosed ]] here...},
# xerces+neko
%{This has an unclosed }
]
assert_includes(acceptable_results, result)
end
# Passes an input/expected pair to the assert_sanitized helper (defined outside this view).
# NOTE(review): both literals are empty in this copy, so the form markup named by
# the test is missing — confirm the fixture.
def test_sanitize_form
assert_sanitized "", ""
end
# Sanitizes a fixture involving the deprecated `plaintext` element and accepts
# any of the parser-specific results listed below.
# NOTE(review): the input literal appears to have lost its tags in this copy — confirm.
def test_sanitize_plaintext
# note that the `plaintext` tag has been deprecated since HTML 2
# https://developer.mozilla.org/en-US/docs/Web/HTML/Element/plaintext
input = "foo"
result = safe_list_sanitize(input)
acceptable_results = [
# libxml2
"foo",
# xerces+nekohtml-unit
"<span>foo</span></plaintext>",
# xerces+cyberneko
"<span>foo</span>"
]
assert_includes(acceptable_results, result)
end
# Passes an input/expected pair to the assert_sanitized helper (defined outside this view).
# NOTE(review): the second literal presumably lost a script element around
# "blah blah blah" in this copy — confirm the fixture and the argument order.
def test_sanitize_script
assert_sanitized "a b cd e f", "a b cblah blah blahd e f"
end
# Passes a fixture mentioning an `onthis` handler attribute, plus an expected
# string, to the assert_sanitized helper (defined outside this view).
# NOTE(review): both literals are identical in this copy; the surrounding markup
# looks stripped — confirm the fixture.
def test_sanitize_js_handlers
raw = %{onthis="do that" hello}
assert_sanitized raw, %{onthis="do that" hello}
end
# Passes a fixture mentioning a `javascript:` href, plus an expected string, to
# the assert_sanitized helper (defined outside this view).
# NOTE(review): both literals are identical in this copy; the surrounding markup
# looks stripped — confirm the fixture.
def test_sanitize_javascript_href
raw = %{href="javascript:bang" foo, bar}
assert_sanitized raw, %{href="javascript:bang" foo, bar}
end
# Passes a fixture mentioning a `javascript:` src, plus an expected string, to
# the assert_sanitized helper (defined outside this view).
# NOTE(review): both literals are identical in this copy; the surrounding markup
# looks stripped — confirm the fixture.
def test_sanitize_image_src
raw = %{src="javascript:bang"
foo, bar}
assert_sanitized raw, %{src="javascript:bang"
foo, bar}
end
# Passes an input/expected pair to the assert_sanitized helper (defined outside this view).
# NOTE(review): the first literal is empty and the second has no anchor markup
# in this copy — confirm the fixture.
def test_should_allow_anchors
assert_sanitized %(), %(baz)
end
# Runs two assert_sanitized checks while `video` is the scoped allowed tag and
# `src`/`poster` are the scoped allowed attributes. The expectation for the
# first check is chosen by platform (RUBY_PLATFORM == "java" versus anything else).
# NOTE(review): every HTML literal here is empty in this copy, so both platform
# branches are currently identical — confirm the fixtures.
def test_video_poster_sanitization
scope_allowed_tags(%w(video)) do
scope_allowed_attributes %w(src poster) do
expected = if RUBY_PLATFORM == "java"
# xerces+nekohtml alphabetizes the attributes! FML.
%()
else
%()
end
assert_sanitized(
%(),
expected,
)
assert_sanitized(
%(),
%(),
)
end
end
end
# RFC 3986, sec 4.2
# Passes a single fixture to assert_sanitized with no explicit expectation.
# NOTE(review): the literal contains no path or colon in this copy, so the
# markup named by the test looks stripped — confirm the fixture.
def test_allow_colons_in_path_component
assert_sanitized "foo"
end
# Defines one test per image attribute name (src, width, height, alt), named
# test_should_allow_image_<attr>_attribute.
# NOTE(review): the literals below no longer reference img_attr in this copy, so
# all four generated tests are currently identical — confirm the fixtures.
%w(src width height alt).each do |img_attr|
define_method "test_should_allow_image_#{img_attr}_attribute" do
assert_sanitized %(
), %(
)
end
end
# Uses one literal as both input and expectation, so the fixture must pass
# through assert_sanitized unchanged.
# NOTE(review): the literal shows no lang/xml:lang attributes in this copy — confirm the fixture.
def test_lang_and_xml_lang
# https://html.spec.whatwg.org/multipage/dom.html#the-lang-and-xml:lang-attributes
#
# 3.2.6.2 The lang and xml:lang attributes
#
# ... Authors must not use the lang attribute in the XML namespace on HTML elements in HTML
# documents. To ease migration to and from XML, authors may specify an attribute in no namespace
# with no prefix and with the literal localname "xml:lang" on HTML elements in HTML documents,
# but such attributes must only be specified if a lang attribute in no namespace is also
# specified, and both attributes must have the same value when compared in an ASCII
# case-insensitive manner.
input = expected = "foo
"
assert_sanitized(input, expected)
end
# Plain text with no markup is handed to assert_sanitized without an explicit
# expected value.
def test_should_handle_non_html
  plain_text = "abc"
  assert_sanitized(plain_text)
end
# nil is expected back as nil; the empty string and a single space are each
# expected back unchanged.
def test_should_handle_blank_text
  assert_nil(safe_list_sanitize(nil))
  ["", " "].each do |blank|
    assert_equal(blank, safe_list_sanitize(blank))
  end
end
# Sanitizes with the sanitizer yielded by scope_allowed_tags (defined outside
# this view) while `u` is the scoped allowed tag.
# NOTE(review): both literals are empty in this copy — confirm the fixture.
def test_setting_allowed_tags_affects_sanitization
scope_allowed_tags %w(u) do |sanitizer|
assert_equal "", sanitizer.sanitize("")
end
end
# Sanitizes with the sanitizer yielded by scope_allowed_attributes (defined
# outside this view) while `foo` is the scoped allowed attribute.
# NOTE(review): both literals are empty in this copy — confirm the fixture.
def test_setting_allowed_attributes_affects_sanitization
scope_allowed_attributes %w(foo) do |sanitizer|
input = ''
assert_equal '', sanitizer.sanitize(input)
end
end
# Sanitizes with a per-call `tags: %w(a)` option while `u` is the scoped allowed
# tag; per the test name, the per-call option is expected to win.
#
# The scoped tags are passed as an array (%w), matching
# test_setting_allowed_tags_affects_sanitization. A bare %(u) is the String "u",
# not a list of tag names.
# NOTE(review): both HTML literals are empty in this copy — confirm the fixture.
def test_custom_tags_overrides_allowed_tags
  scope_allowed_tags %w(u) do |sanitizer|
    input = ""
    assert_equal "", sanitizer.sanitize(input, tags: %w(a))
  end
end
# Sanitizes with a per-call `attributes: %w(bar)` option while `foo` is the
# scoped allowed attribute; per the test name, the per-call option is expected
# to win.
#
# The scoped attributes are passed as an array (%w), matching
# test_setting_allowed_attributes_affects_sanitization. A bare %(foo) is the
# String "foo", not a list of attribute names.
# NOTE(review): both HTML literals are empty in this copy — confirm the fixture.
def test_custom_attributes_overrides_allowed_attributes
  scope_allowed_attributes %w(foo) do |sanitizer|
    input = ''
    assert_equal '', sanitizer.sanitize(input, attributes: %w(bar))
  end
end
# Builds a SafeListSanitizer with prune: true and sanitizes with only `u`
# allowed; the expected result drops the trailing "now".
# NOTE(review): the input literal appears to have lost its tags in this copy — confirm.
def test_should_allow_prune
sanitizer = module_under_test::SafeListSanitizer.new(prune: true)
text = "leave me now"
assert_equal "leave me ", sanitizer.sanitize(text, tags: %w(u))
end
# With tags: %w(u), the fixture is expected to come back unchanged.
# NOTE(review): the literal contains no tags in this copy — confirm the fixture.
def test_should_allow_custom_tags
text = "foo"
assert_equal text, safe_list_sanitize(text, tags: %w(u))
end
# Sanitizes with tags: %w(u) and compares the result with a fixed string.
# NOTE(review): input and expectation are identical and tag-free in this copy —
# confirm the fixture.
def test_should_allow_only_custom_tags
text = "foo with bar"
assert_equal "foo with bar", safe_list_sanitize(text, tags: %w(u))
end
# With default options, the fixture is expected to come back unchanged.
# NOTE(review): the literal contains no tags or attributes in this copy — confirm.
def test_should_allow_custom_tags_with_attributes
text = %(foo
)
assert_equal text, safe_list_sanitize(text)
end
# With attributes: ["foo"], the fixture is expected to come back unchanged.
# NOTE(review): the literal contains no tags or attributes in this copy — confirm.
def test_should_allow_custom_tags_with_custom_attributes
text = %(Lorem ipsum
)
assert_equal text, safe_list_sanitize(text, attributes: ["foo"])
end
# Sanitizes with attributes: %w(style) and accepts either of two results.
# NOTE(review): the input and both acceptable results are empty strings in this
# copy — confirm the fixture.
def test_scrub_style_if_style_attribute_option_is_passed
input = ''
actual = safe_list_sanitize(input, attributes: %w(style))
assert_includes(['', ''], actual)
end
# A String passed as `tags:` (rather than a collection) must raise ArgumentError.
def test_should_raise_argument_error_if_tags_is_not_enumerable
  assert_raises(ArgumentError) { safe_list_sanitize("some html", tags: "foo") }
end
# A String passed as `attributes:` (rather than a collection) must raise ArgumentError.
def test_should_raise_argument_error_if_attributes_is_not_enumerable
  assert_raises(ArgumentError) { safe_list_sanitize("some html", attributes: "foo") }
end
# A plain Object that merely responds to #scrub (without being a
# Loofah::Scrubber) must be rejected with Loofah::ScrubberNotFound.
def test_should_not_accept_non_loofah_inheriting_scrubber
  impostor = Object.new
  def impostor.scrub(node)
    node.name = "h1"
  end

  assert_raises(Loofah::ScrubberNotFound) { safe_list_sanitize("some html", scrubber: impostor) }
end
# Passes a Loofah::Scrubber instance with a singleton #scrub that replaces each
# node with a string built from its inner_html, and compares with a fixed string.
# NOTE(review): the replacement template, the input and the expectation appear
# to have lost their tags in this copy — confirm the fixture.
def test_should_accept_loofah_inheriting_scrubber
scrubber = Loofah::Scrubber.new
def scrubber.scrub(node); node.replace("#{node.inner_html}
"); end
html = ""
assert_equal "hello!
", safe_list_sanitize(html, scrubber: scrubber)
end
# Passes a Loofah::Scrubber built from a block that replaces each node with a
# string built from its inner_html, and compares with a fixed string.
# NOTE(review): the replacement template, the input and the expectation appear
# to have lost their tags in this copy — confirm the fixture.
def test_should_accept_loofah_scrubber_that_wraps_a_block
scrubber = Loofah::Scrubber.new { |node| node.replace("#{node.inner_html}
") }
html = ""
assert_equal "hello!
", safe_list_sanitize(html, scrubber: scrubber)
end
# Passes both scrubber: and tags: ["foo"]; the expectation is the same string
# used by the block-scrubber test, i.e. the scrubber is expected to win.
# NOTE(review): the replacement template, the input and the expectation appear
# to have lost their tags in this copy — confirm the fixture.
def test_custom_scrubber_takes_precedence_over_other_options
scrubber = Loofah::Scrubber.new { |node| node.replace("#{node.inner_html}
") }
html = ""
assert_equal "hello!
", safe_list_sanitize(html, scrubber: scrubber, tags: ["foo"])
end
# Passes an input/expected pair to the assert_sanitized helper (defined outside this view).
# NOTE(review): both literals contain only a newline in this copy, so the img
# markup named by the test is missing — confirm the fixture.
def test_should_strip_src_attribute_in_img_with_bad_protocols
assert_sanitized %(
), %(
)
end
# Passes an input/expected pair to the assert_sanitized helper (defined outside this view).
# NOTE(review): both literals are the bare text "boo" in this copy, so the anchor
# markup named by the test is missing — confirm the fixture.
def test_should_strip_href_attribute_in_a_with_bad_protocols
assert_sanitized %(boo), %(boo)
end
# Passes an input and an empty expected string to the assert_sanitized helper
# (defined outside this view).
# NOTE(review): the input literal is empty in this copy, so the script markup
# named by the test is missing — confirm the fixture.
def test_should_block_script_tag
assert_sanitized %(), ""
end
# Passes an input/expected pair to the assert_sanitized helper (defined outside this view).
# NOTE(review): both literals look like fragments of a longer payload whose tags
# were stripped in this copy — confirm the fixture and the argument order.
def test_should_not_fall_for_xss_image_hack_with_uppercase_tags
assert_sanitized %(
">), %(
alert("XSS")">)
end
# Defines one test per payload in the list below; each generated test passes its
# payload and a fixed expected string to assert_sanitized. The payload text is
# interpolated into the method name.
# NOTE(review): all fifteen payloads are the same newline-only string in this
# copy, so each iteration redefines a single method name — restore the distinct
# payloads from the canonical file.
[%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
),
%(
)].each do |img_hack|
define_method "test_should_not_fall_for_xss_image_hack_#{img_hack}" do
assert_sanitized img_hack, "
"
end
end
# Sanitizes a payload and accepts either of the parser-specific results listed below.
# NOTE(review): the input literal shows neither a tag nor a null byte in this
# copy — confirm the fixture.
def test_should_sanitize_tag_broken_up_by_null
input = %(alert(\"XSS\"))
result = safe_list_sanitize(input)
acceptable_results = [
# libxml2
"",
# xerces+neko
'alert("XSS")',
]
assert_includes(acceptable_results, result)
end
# Passes an input and an empty expected string to the assert_sanitized helper
# (defined outside this view).
# NOTE(review): the input literal is empty in this copy, so the script markup
# named by the test is missing — confirm the fixture.
def test_should_sanitize_invalid_script_tag
assert_sanitized %(), ""
end
# Passes an input/expected pair to the assert_sanitized helper (defined outside this view).
# NOTE(review): the input literal is a lone "<" in this copy; the rest of the
# payload looks stripped — confirm the fixture.
def test_should_sanitize_script_tag_with_multiple_open_brackets
assert_sanitized %(<), "<alert(\"XSS\");//<"
end
def test_should_sanitize_script_tag_with_multiple_open_brackets_2
input = %(