# frozen_string_literal: true
module Loofah
#
# Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
#
# Traverse the document or fragment, invoking the +scrubber+ on
# each node.
#
# +scrubber+ must either be one of the symbols representing the
# built-in scrubbers (see Scrubbers), or a Scrubber instance.
#
# span2div = Loofah::Scrubber.new do |node|
# node.name = "div" if node.name == "span"
# end
# Loofah.fragment("foo
bar
").scrub!(span2div).to_s
# # => "foo
bar
"
#
# or
#
# unsafe_html = "ohai! div is safe
"
# Loofah.fragment(unsafe_html).scrub!(:strip).to_s
# # => "ohai! div is safe
"
#
# Note that this method is called implicitly from
# Loofah.scrub_fragment and Loofah.scrub_document.
#
# Please see Scrubber for more information on implementation and traversal, and
# README.rdoc for more example usage.
#
module ScrubBehavior
module Node # :nodoc:
def scrub!(scrubber)
#
# yes. this should be three separate methods. but nokogiri
# decorates (or not) based on whether the module name has
# already been included. and since documents get decorated
# just like their constituent nodes, we need to jam all the
# logic into a single module.
#
scrubber = ScrubBehavior.resolve_scrubber(scrubber)
case self
when Nokogiri::XML::Document
scrubber.traverse(root) if root
when Nokogiri::XML::DocumentFragment
children.scrub! scrubber
else
scrubber.traverse(self)
end
self
end
end
module NodeSet # :nodoc:
def scrub!(scrubber)
each { |node| node.scrub!(scrubber) }
self
end
end
def ScrubBehavior.resolve_scrubber(scrubber) # :nodoc:
scrubber = Scrubbers::MAP[scrubber].new if Scrubbers::MAP[scrubber]
unless scrubber.is_a?(Loofah::Scrubber)
raise Loofah::ScrubberNotFound, "not a Scrubber or a scrubber name: #{scrubber.inspect}"
end
scrubber
end
end
#
# Overrides +text+ in HTML::Document and HTML::DocumentFragment,
# and mixes in +to_text+.
#
module TextBehavior
#
# Returns a plain-text version of the markup contained by the document,
# with HTML entities encoded.
#
# This method is significantly faster than #to_text, but isn't
# clever about whitespace around block elements.
#
# Loofah.document("Title
Content
").text
# # => "TitleContent"
#
# By default, the returned text will have HTML entities
# escaped. If you want unescaped entities, and you understand
# that the result is unsafe to render in a browser, then you
# can pass an argument as shown:
#
# frag = Loofah.fragment("<script>alert('EVIL');</script>")
# # ok for browser:
# frag.text # => "<script>alert('EVIL');</script>"
# # decidedly not ok for browser:
# frag.text(:encode_special_chars => false) # => ""
#
def text(options={})
result = serialize_root.children.inner_text rescue ""
if options[:encode_special_chars] == false
result # possibly dangerous if rendered in a browser
else
encode_special_chars result
end
end
alias :inner_text :text
alias :to_str :text
#
# Returns a plain-text version of the markup contained by the
# fragment, with HTML entities encoded.
#
# This method is slower than #to_text, but is clever about
# whitespace around block elements.
#
# Loofah.document("Title
Content
").to_text
# # => "\nTitle\n\nContent\n"
#
def to_text(options={})
Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
end
end
module DocumentDecorator # :nodoc:
def initialize(*args, &block)
super
self.decorators(Nokogiri::XML::Node) << ScrubBehavior::Node
self.decorators(Nokogiri::XML::NodeSet) << ScrubBehavior::NodeSet
end
end
end