module Loofah
  #
  #  Loofah provides some built-in scrubbers for sanitizing with
  #  HTML5lib's whitelist and for accomplishing some common
  #  transformation tasks.
  #
  #
  #  === Loofah::Scrubbers::Strip / scrub!(:strip)
  #
  #  +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
  #
  #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  #     Loofah.fragment(unsafe_html).scrub!(:strip)
  #     # => "ohai! <div>div is safe</div> but foo is <b>not</b>"
  #
  #
  #  === Loofah::Scrubbers::Prune / scrub!(:prune)
  #
  #  +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
  #
  #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  #     Loofah.fragment(unsafe_html).scrub!(:prune)
  #     # => "ohai! <div>div is safe</div> "
  #
  #
  #  === Loofah::Scrubbers::Escape / scrub!(:escape)
  #
  #  +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
  #
  #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
  #     Loofah.fragment(unsafe_html).scrub!(:escape)
  #     # => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
  #
  #
  #  === Loofah::Scrubbers::Whitewash / scrub!(:whitewash)
  #
  #  +:whitewash+ removes all comments, styling and attributes in
  #  addition to doing markup-fixer-uppery and pruning unsafe tags. I
  #  like to call this "whitewashing", since it's like putting a new
  #  layer of paint on top of the HTML input to make it look nice.
  #
  #     messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
  #     Loofah.fragment(messy_markup).scrub!(:whitewash)
  #     # => "ohai! <div>div with attributes</div>"
  #
  #  One use case for this scrubber is to clean up HTML that was
  #  cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
  #  rich text editor. Microsoft's software is famous for injecting
  #  all kinds of cruft into its HTML output. Who needs that crap?
  #  Certainly not me.
  #
  #
  #  === Loofah::Scrubbers::NoFollow / scrub!(:nofollow)
  #
  #  +:nofollow+ adds a rel="nofollow" attribute to all links:
  #
  #     link_farmers_markup = "ohai! <a href='http://www.example.com/'>I like your blog post</a>"
  #     Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
  #     # => "ohai! <a href='http://www.example.com/' rel="nofollow">I like your blog post</a>"
  #
  module Scrubbers
    #
    #  === scrub!(:strip)
    #
    #  +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
    #
    #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
    #     Loofah.fragment(unsafe_html).scrub!(:strip)
    #     # => "ohai! <div>div is safe</div> but foo is <b>not</b>"
    #
    class Strip < Scrubber
      def initialize
        @direction = :bottom_up
      end

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
        node.before node.inner_html
        node.remove
      end
    end

    #
    #  === scrub!(:prune)
    #
    #  +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
    #
    #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
    #     Loofah.fragment(unsafe_html).scrub!(:prune)
    #     # => "ohai! <div>div is safe</div> "
    #
    class Prune < Scrubber
      def initialize
        @direction = :top_down
      end

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
        node.remove
        return STOP
      end
    end

    #
    #  === scrub!(:escape)
    #
    #  +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
    #
    #     unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
    #     Loofah.fragment(unsafe_html).scrub!(:escape)
    #     # => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
    #
    class Escape < Scrubber
      def initialize
        @direction = :top_down
      end

      def scrub(node)
        return CONTINUE if html5lib_sanitize(node) == CONTINUE
        replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
        node.add_next_sibling replacement_killer
        node.remove
        return STOP
      end
    end
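
    #
    #  How the escaping above works, as a standalone Nokogiri sketch
    #  (the variable names are illustrative): the unsafe node is serialized
    #  and re-inserted as a text node, and text nodes are entity-escaped
    #  when the tree is rendered back to HTML.
    #
    #     frag = Nokogiri::HTML.fragment("")
    #     frag.add_child Nokogiri::XML::Text.new("<foo>hi</foo>", frag.document)
    #     frag.to_html   # => "&lt;foo&gt;hi&lt;/foo&gt;"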

    #
    #  === scrub!(:whitewash)
    #
    #  +:whitewash+ removes all comments, styling and attributes in
    #  addition to doing markup-fixer-uppery and pruning unsafe tags. I
    #  like to call this "whitewashing", since it's like putting a new
    #  layer of paint on top of the HTML input to make it look nice.
    #
    #     messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
    #     Loofah.fragment(messy_markup).scrub!(:whitewash)
    #     # => "ohai! <div>div with attributes</div>"
    #
    #  One use case for this scrubber is to clean up HTML that was
    #  cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
    #  rich text editor. Microsoft's software is famous for injecting
    #  all kinds of cruft into its HTML output. Who needs that crap?
    #  Certainly not me.
    #
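    #  For instance, Word-flavored markup full of proprietary attributes and
    #  conditional comments comes back out as plain tags (illustrative input;
    #  the output shown is approximate):
    #
    #     word_html = "<p class='MsoNormal' style='margin:0cm'><!--[if !mso]-->Hello <b style='mso-bidi-font-weight:normal'>world</b></p>"
    #     Loofah.fragment(word_html).scrub!(:whitewash).to_s
    #     # => "<p>Hello <b>world</b></p>"
    #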
" # # One use case for this scrubber is to clean up HTML that was # cut-and-pasted from Microsoft Word into a WYSIWYG editor or a # rich text editor. Microsoft's software is famous for injecting # all kinds of cruft into its HTML output. Who needs that crap? # Certainly not me. # class Whitewash < Scrubber def initialize @direction = :top_down end def scrub(node) case node.type when Nokogiri::XML::Node::ELEMENT_NODE if HTML5::HashedWhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2[node.name] node.attributes.each { |attr| node.remove_attribute(attr.first) } return CONTINUE if node.namespaces.empty? end when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE return CONTINUE end node.remove STOP end end # # === scrub!(:nofollow) # # +:nofollow+ adds a rel="nofollow" attribute to all links # # link_farmers_markup = "ohai! I like your blog post" # Loofah.fragment(link_farmers_markup).scrub!(:nofollow) # => "ohai! I like your blog post" # class NoFollow < Scrubber def initialize @direction = :top_down end def scrub(node) return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a') node.set_attribute('rel', 'nofollow') return STOP end end # This class probably isn't useful publicly, but is used for #to_text's current implemention class NewlineBlockElements < Scrubber # :nodoc: def initialize @direction = :bottom_up end def scrub(node) return CONTINUE unless Loofah::HashedElements::BLOCK_LEVEL[node.name] replacement_killer = Nokogiri::XML::Text.new("\n#{node.content}\n", node.document) node.add_next_sibling replacement_killer node.remove end end # # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune). # MAP = { :escape => Escape, :prune => Prune, :whitewash => Whitewash, :strip => Strip, :nofollow => NoFollow, :newline_block_elements => NewlineBlockElements } # # Returns an array of symbols representing the built-in scrubbers # def self.scrubber_symbols MAP.keys end end end