lib/dryopteris/sanitize.rb in mdalessio-dryopteris-0.1.0 vs lib/dryopteris/sanitize.rb in mdalessio-dryopteris-0.1.1

- old
+ new

@@ -16,10 +16,23 @@ body_element = doc.at("/html/body") return "" if body_element.nil? body_element.inner_text end + def whitewash(string_or_io, encoding=nil) + return nil if string_or_io.nil? + return "" if string_or_io.strip.size == 0 + + doc = Nokogiri::HTML.parse(string_or_io, nil, encoding) + body = doc.xpath("/html/body").first + return "" if body.nil? + body.children.each do |node| + traverse_conditionally_top_down(node, :whitewash_node) + end + body.children.map { |x| x.to_xml }.join + end + def sanitize(string, encoding=nil) return nil if string.nil? return "" if string.strip.size == 0 string = "<html><body>" + string + "</body></html>" @@ -44,10 +57,11 @@ end doc.root.to_xml end private + def traverse_conditionally_top_down(node, method_name) return if send(method_name, node) node.children.each {|j| traverse_conditionally_top_down(j, method_name)} end @@ -84,9 +98,35 @@ when 4 # Nokogiri::XML::Node::CDATA_SECTION_NODE return false end replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document) node.add_next_sibling(replacement_killer) + node.remove + return true + end + + + def whitewash_node(node) + case node.type + when 1 # Nokogiri::XML::Node::ELEMENT_NODE + if HashedWhiteList::ALLOWED_ELEMENTS[node.name] + node.attributes.each { |attr| node.remove_attribute(attr.first) } + has_no_namespaces = true + begin + has_no_namespaces = node.namespaces.empty? + rescue + # older versions of nokogiri raise an exception when there + # is a namespace on the node that is not declared with an href. + # see http://github.com/tenderlove/nokogiri/commit/395d7971304e1489e92c494b9c50609f4b4c4ab0 + has_no_namespaces = false + end + return false if has_no_namespaces + end + when 3 # Nokogiri::XML::Node::TEXT_NODE + return false + when 4 # Nokogiri::XML::Node::CDATA_SECTION_NODE + return false + end node.remove return true end