# frozen_string_literal: true module Loofah # # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet. # # Traverse the document or fragment, invoking the +scrubber+ on each node. # # +scrubber+ must either be one of the symbols representing the built-in scrubbers (see # Scrubbers), or a Scrubber instance. # # span2div = Loofah::Scrubber.new do |node| # node.name = "div" if node.name == "span" # end # Loofah.html5_fragment("foo

bar

").scrub!(span2div).to_s # # => "
foo

bar

" # # or # # unsafe_html = "ohai!
div is safe
" # Loofah.html5_fragment(unsafe_html).scrub!(:strip).to_s # # => "ohai!
div is safe
" # # Note that this method is called implicitly from the shortcuts Loofah.scrub_html5_fragment et # al. # # Please see Scrubber for more information on implementation and traversal, and README.rdoc for # more example usage. # module ScrubBehavior module Node # :nodoc: def scrub!(scrubber) # # yes. this should be three separate methods. but nokogiri decorates (or not) based on # whether the module name has already been included. and since documents get decorated just # like their constituent nodes, we need to jam all the logic into a single module. # scrubber = ScrubBehavior.resolve_scrubber(scrubber) case self when Nokogiri::XML::Document scrubber.traverse(root) if root when Nokogiri::XML::DocumentFragment children.scrub!(scrubber) else scrubber.traverse(self) end self end end module NodeSet # :nodoc: def scrub!(scrubber) each { |node| node.scrub!(scrubber) } self end end class << self def resolve_scrubber(scrubber) # :nodoc: scrubber = Scrubbers::MAP[scrubber].new if Scrubbers::MAP[scrubber] unless scrubber.is_a?(Loofah::Scrubber) raise Loofah::ScrubberNotFound, "not a Scrubber or a scrubber name: #{scrubber.inspect}" end scrubber end end end # # Overrides +text+ in Document and DocumentFragment classes, and mixes in +to_text+. # module TextBehavior # # Returns a plain-text version of the markup contained by the document, with HTML entities # encoded. # # This method is significantly faster than #to_text, but isn't clever about whitespace around # block elements. # # Loofah.html5_document("

Title

Content
").text # # => "TitleContent" # # By default, the returned text will have HTML entities escaped. If you want unescaped # entities, and you understand that the result is unsafe to render in a browser, then you can # pass an argument as shown: # # frag = Loofah.html5_fragment("<script>alert('EVIL');</script>") # # ok for browser: # frag.text # => "<script>alert('EVIL');</script>" # # decidedly not ok for browser: # frag.text(:encode_special_chars => false) # => "" # def text(options = {}) result = if serialize_root serialize_root.children.reject(&:comment?).map(&:inner_text).join("") else "" end if options[:encode_special_chars] == false result # possibly dangerous if rendered in a browser else encode_special_chars(result) end end alias_method :inner_text, :text alias_method :to_str, :text # # Returns a plain-text version of the markup contained by the fragment, with HTML entities # encoded. # # This method is slower than #text, but is clever about whitespace around block elements and # line break elements. # # Loofah.html5_document("

Title

Content
Next line
").to_text # # => "\nTitle\n\nContent\nNext line\n" # def to_text(options = {}) Loofah.remove_extraneous_whitespace(dup.scrub!(:newline_block_elements).text(options)) end end module DocumentDecorator # :nodoc: def initialize(*args, &block) super decorators(Nokogiri::XML::Node) << ScrubBehavior::Node decorators(Nokogiri::XML::NodeSet) << ScrubBehavior::NodeSet end end module HtmlDocumentBehavior # :nodoc: module ClassMethods def parse(*args, &block) remove_comments_before_html_element(super) end private # remove comments that exist outside of the HTML element. # # these comments are allowed by the HTML spec: # # https://www.w3.org/TR/html401/struct/global.html#h-7.1 # # but are not scrubbed by Loofah because these nodes don't meet # the contract that scrubbers expect of a node (e.g., it can be # replaced, sibling and children nodes can be created). def remove_comments_before_html_element(doc) doc.children.each do |child| child.unlink if child.comment? end doc end end class << self def included(base) base.extend(ClassMethods) end end def serialize_root at_xpath("/html/body") end end module HtmlFragmentBehavior # :nodoc: module ClassMethods def parse(tags, encoding = nil) doc = document_klass.new encoding ||= tags.respond_to?(:encoding) ? tags.encoding.name : "UTF-8" doc.encoding = encoding new(doc, tags) end def document_klass @document_klass ||= if Loofah.html5_support? && self == Loofah::HTML5::DocumentFragment Loofah::HTML5::Document elsif self == Loofah::HTML4::DocumentFragment Loofah::HTML4::Document else raise ArgumentError, "unexpected class: #{self}" end end end class << self def included(base) base.extend(ClassMethods) end end def to_s serialize_root.children.to_s end alias_method :serialize, :to_s def serialize_root at_xpath("./body") || self end end end