# frozen_string_literal: true module Loofah # # Loofah provides some built-in scrubbers for sanitizing with # HTML5lib's safelist and for accomplishing some common # transformation tasks. # # # === Loofah::Scrubbers::Strip / scrub!(:strip) # # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents: # # unsafe_html = "ohai!

div is safe

but foo is not" # Loofah.html5_fragment(unsafe_html).scrub!(:strip) # => "ohai!

div is safe

but foo is not" # # # === Loofah::Scrubbers::Prune / scrub!(:prune) # # +:prune+ removes unknown/unsafe tags and their contents (including their subtrees): # # unsafe_html = "ohai!

div is safe

but foo is not" # Loofah.html5_fragment(unsafe_html).scrub!(:prune) # => "ohai!

div is safe

" # # # === Loofah::Scrubbers::Escape / scrub!(:escape) # # +:escape+ performs HTML entity escaping on the unknown/unsafe tags: # # unsafe_html = "ohai!

div is safe

but foo is not" # Loofah.html5_fragment(unsafe_html).scrub!(:escape) # => "ohai!

div is safe

<foo>but foo is <b>not</b></foo>" # # # === Loofah::Scrubbers::Whitewash / scrub!(:whitewash) # # +:whitewash+ removes all comments, styling and attributes in # addition to doing markup-fixer-uppery and pruning unsafe tags. I # like to call this "whitewashing", since it's like putting a new # layer of paint on top of the HTML input to make it look nice. # # messy_markup = "ohai!

div with attributes

" # Loofah.html5_fragment(messy_markup).scrub!(:whitewash) # => "ohai!

div with attributes

Some text with an unprintable character at the end\u2028

" # Loofah.html5_fragment(markup).scrub!(:unprintable) # => "

Some text with an unprintable character at the end

" # # You may not be able to see the unprintable character in the above example, but there is a # U+2028 character right before the closing

tag. These characters can cause issues if # the content is ever parsed by JavaScript - more information here: # # http://timelessrepo.com/json-isnt-a-javascript-subset # module Scrubbers # # === scrub!(:strip) # # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents: # # unsafe_html = "ohai!

div is safe

but foo is not" # Loofah.html5_fragment(unsafe_html).scrub!(:strip) # => "ohai!

div is safe

# but foo is not"
#
# Scrubber registered for scrub!(:strip). The tree is walked bottom-up;
# any node for which html5lib_sanitize does not answer CONTINUE has its
# children moved in front of it and is then removed, so the contents
# survive while the node itself disappears.
class Strip < Scrubber
  # Only selects the traversal direction; the parent initializer is
  # deliberately not invoked (hence the rubocop directive).
  def initialize # rubocop:disable Lint/MissingSuper
    @direction = :bottom_up
  end

  # Returns CONTINUE when html5lib_sanitize(node) returns CONTINUE.
  # Otherwise hoists the node's children before it, removes the node
  # and returns STOP.
  def scrub(node)
    verdict = html5lib_sanitize(node)

    unless verdict == CONTINUE
      node.before(node.children)
      node.remove
      return STOP
    end

    CONTINUE
  end
end

#
# === scrub!(:prune)
#
# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
#
#   unsafe_html = "ohai!

div is safe

but foo is not" # Loofah.html5_fragment(unsafe_html).scrub!(:prune) # => "ohai!

# div is safe
#
# "
#
# Scrubber registered for scrub!(:prune). The tree is walked top-down;
# any node for which html5lib_sanitize does not answer CONTINUE is
# removed together with everything beneath it.
class Prune < Scrubber
  # Only selects the traversal direction; the parent initializer is
  # deliberately not invoked (hence the rubocop directive).
  def initialize # rubocop:disable Lint/MissingSuper
    @direction = :top_down
  end

  # Returns CONTINUE when html5lib_sanitize(node) returns CONTINUE;
  # otherwise removes the node and returns STOP.
  def scrub(node)
    if html5lib_sanitize(node) == CONTINUE
      CONTINUE
    else
      node.remove
      STOP
    end
  end
end

#
# === scrub!(:escape)
#
# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
#
#   unsafe_html = "ohai!

div is safe

but foo is not" # Loofah.html5_fragment(unsafe_html).scrub!(:escape) # => "ohai!

# div is safe
#
# <foo>but foo is <b>not</b></foo>"
#
# Scrubber registered for scrub!(:escape). The tree is walked top-down;
# any node for which html5lib_sanitize does not answer CONTINUE is
# swapped for a text node whose content is the node's own markup
# (node.to_s).
class Escape < Scrubber
  # Only selects the traversal direction; the parent initializer is
  # deliberately not invoked (hence the rubocop directive).
  def initialize # rubocop:disable Lint/MissingSuper
    @direction = :top_down
  end

  # Returns CONTINUE when html5lib_sanitize(node) returns CONTINUE.
  # Otherwise inserts a text node built from node.to_s right after the
  # node, removes the node and returns STOP.
  def scrub(node)
    return CONTINUE if html5lib_sanitize(node) == CONTINUE

    markup_as_text = Nokogiri::XML::Text.new(node.to_s, node.document)
    node.add_next_sibling(markup_as_text)
    node.remove

    STOP
  end
end

#
# === scrub!(:whitewash)
#
# +:whitewash+ removes all comments, styling and attributes in
# addition to doing markup-fixer-uppery and pruning unsafe tags. I
# like to call this "whitewashing", since it's like putting a new
# layer of paint on top of the HTML input to make it look nice.
#
#   messy_markup = "ohai!

div with attributes

" # Loofah.html5_fragment(messy_markup).scrub!(:whitewash) # => "ohai!

# div with attributes
#
# "
#
# One use case for this scrubber is to clean up HTML that was
# cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
# rich text editor. Microsoft's software is famous for injecting
# all kinds of cruft into its HTML output. Who needs that crap?
# Certainly not me.
#
# Walks the tree top-down. Text and CDATA nodes are kept as they are;
# elements accepted by HTML5::Scrub.allowed_element? lose every
# attribute and are kept only when they declare no namespaces; every
# other node is removed.
class Whitewash < Scrubber
  # Only selects the traversal direction; the parent initializer is
  # deliberately not invoked (hence the rubocop directive).
  def initialize # rubocop:disable Lint/MissingSuper
    @direction = :top_down
  end

  # Returns CONTINUE for text/CDATA nodes and for allowed,
  # namespace-free elements (after clearing their attributes);
  # otherwise removes the node and returns STOP.
  def scrub(node)
    kind = node.type

    if kind == Nokogiri::XML::Node::TEXT_NODE || kind == Nokogiri::XML::Node::CDATA_SECTION_NODE
      return CONTINUE
    end

    if kind == Nokogiri::XML::Node::ELEMENT_NODE && HTML5::Scrub.allowed_element?(node.name)
      # Attributes are stripped even when the element ends up being
      # removed below because it carries namespace declarations.
      node.attributes.each_key { |attr_name| node.remove_attribute(attr_name) }
      return CONTINUE if node.namespaces.empty?
    end

    node.remove
    STOP
  end
end

#
# === scrub!(:nofollow)
#
# +:nofollow+ adds a rel="nofollow" attribute to all links
#
#   link_farmers_markup = "ohai! I like your blog post"
#   Loofah.html5_fragment(link_farmers_markup).scrub!(:nofollow)
#   => "ohai! I like your blog post"
#
class NoFollow < Scrubber
  # Only selects the traversal direction; the parent initializer is
  # deliberately not invoked (hence the rubocop directive).
  def initialize # rubocop:disable Lint/MissingSuper
    @direction = :top_down
  end

  # Returns CONTINUE for anything that is not an "a" element. For an
  # "a" element, hands "rel"/"nofollow" to append_attribute and
  # returns STOP.
  def scrub(node)
    anchor = node.type == Nokogiri::XML::Node::ELEMENT_NODE && node.name == "a"
    return CONTINUE unless anchor

    append_attribute(node, "rel", "nofollow")
    STOP
  end
end

#
# === scrub!(:targetblank)
#
# +:targetblank+ adds a target="_blank" attribute to all links.
# If there is a target already set, replaces it with target="_blank".
#
#   link_farmers_markup = "ohai! I like your blog post"
#   Loofah.html5_fragment(link_farmers_markup).scrub!(:targetblank)
#   => "ohai! I like your blog post"
#
# On modern browsers, setting target="_blank" on anchor elements implicitly provides the same
# behavior as setting rel="noopener".
#
class TargetBlank < Scrubber
  # Only selects the traversal direction; the parent initializer is
  # deliberately not invoked (hence the rubocop directive).
  def initialize # rubocop:disable Lint/MissingSuper
    @direction = :top_down
  end

  # Returns CONTINUE for anything that is not an "a" element. For an
  # "a" element, sets target="_blank" (overwriting any existing value)
  # and returns STOP.
  def scrub(node)
    anchor = node.type == Nokogiri::XML::Node::ELEMENT_NODE && node.name == "a"
    return CONTINUE unless anchor

    node.set_attribute("target", "_blank")
    STOP
  end
end

#
# === scrub!(:noopener)
#
# +:noopener+ adds a rel="noopener" attribute to all links
#
#   link_farmers_markup = "ohai! I like your blog post"
#   Loofah.html5_fragment(link_farmers_markup).scrub!(:noopener)
#   => "ohai! I like your blog post"
#
class NoOpener < Scrubber
  # Only selects the traversal direction; the parent initializer is
  # deliberately not invoked (hence the rubocop directive).
  def initialize # rubocop:disable Lint/MissingSuper
    @direction = :top_down
  end

  # Returns CONTINUE for anything that is not an "a" element. For an
  # "a" element, hands "rel"/"noopener" to append_attribute and
  # returns STOP.
  def scrub(node)
    anchor = node.type == Nokogiri::XML::Node::ELEMENT_NODE && node.name == "a"
    return CONTINUE unless anchor

    append_attribute(node, "rel", "noopener")
    STOP
  end
end

#
# === scrub!(:noreferrer)
#
# +:noreferrer+ adds a rel="noreferrer" attribute to all links
#
#   link_farmers_markup = "ohai! I like your blog post"
#   Loofah.html5_fragment(link_farmers_markup).scrub!(:noreferrer)
#   => "ohai! I like your blog post"
#
class NoReferrer < Scrubber
  # Only selects the traversal direction; the parent initializer is
  # deliberately not invoked (hence the rubocop directive).
  def initialize # rubocop:disable Lint/MissingSuper
    @direction = :top_down
  end

  # Returns CONTINUE for anything that is not an "a" element. For an
  # "a" element, hands "rel"/"noreferrer" to append_attribute and
  # returns STOP.
  def scrub(node)
    anchor = node.type == Nokogiri::XML::Node::ELEMENT_NODE && node.name == "a"
    return CONTINUE unless anchor

    append_attribute(node, "rel", "noreferrer")
    STOP
  end
end

# This class probably isn't useful publicly, but is used for #to_text's current implementation
class NewlineBlockElements < Scrubber # :nodoc:
  # Only selects the traversal direction; the parent initializer is
  # deliberately not invoked (hence the rubocop directive).
  def initialize # rubocop:disable Lint/MissingSuper
    @direction = :bottom_up
  end

  # Returns CONTINUE for nodes whose name is not listed in
  # Loofah::Elements::LINEBREAKERS. A listed node is swapped for a text
  # node: a bare newline when the name is also in
  # Loofah::Elements::INLINE_LINE_BREAK, otherwise the node's content
  # wrapped in newlines. In that case the result of node.remove is the
  # return value.
  def scrub(node)
    tag = node.name
    return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(tag)

    text = Loofah::Elements::INLINE_LINE_BREAK.include?(tag) ? "\n" : "\n#{node.content}\n"
    node.add_next_sibling(Nokogiri::XML::Text.new(text, node.document))
    node.remove
  end
end

#
# === scrub!(:unprintable)
#
# +:unprintable+ removes unprintable Unicode characters.
#
#   markup = "

Some text with an unprintable character at the end\u2028

" # Loofah.html5_fragment(markup).scrub!(:unprintable) # => "

Some text with an unprintable character at the end

" # # You may not be able to see the unprintable character in the above example, but there is a # U+2028 character right before the closing

tag. These characters can cause issues if # the content is ever parsed by JavaScript - more information here: # # http://timelessrepo.com/json-isnt-a-javascript-subset # class Unprintable < Scrubber def initialize # rubocop:disable Lint/MissingSuper @direction = :top_down end def scrub(node) if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE node.content = node.content.gsub(/\u2028|\u2029/, "") end CONTINUE end end # # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune). # MAP = { escape: Escape, prune: Prune, whitewash: Whitewash, strip: Strip, nofollow: NoFollow, noopener: NoOpener, noreferrer: NoReferrer, targetblank: TargetBlank, newline_block_elements: NewlineBlockElements, unprintable: Unprintable, } class << self # # Returns an array of symbols representing the built-in scrubbers # def scrubber_symbols MAP.keys end end end end