/** * (The MIT License) * * Copyright (c) 2008 - 2012: * * * {Aaron Patterson}[http://tenderlovemaking.com] * * {Mike Dalessio}[http://mike.daless.io] * * {Charles Nutter}[http://blog.headius.com] * * {Sergio Arbeo}[http://www.serabe.com] * * {Patrick Mahoney}[http://polycrystal.org] * * {Yoko Harada}[http://yokolet.blogspot.com] * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * 'Software'), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
 */
package nokogiri.internals;

import static nokogiri.internals.NokogiriHelpers.canonicalizeWhitespce;
import static nokogiri.internals.NokogiriHelpers.encodeJavaString;
import static nokogiri.internals.NokogiriHelpers.isNamespace;
import static nokogiri.internals.NokogiriHelpers.isWhitespaceText;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Deque;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.cyberneko.html.HTMLElements;
import org.w3c.dom.Attr;
import org.w3c.dom.CDATASection;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.Entity;
import org.w3c.dom.EntityReference;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.Notation;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.Text;

/**
 * A class for serializing a document.
 *
 * Each enter/leave pair appends the textual form of one DOM node to an
 * internal buffer; toString() returns whatever has been accumulated so far.
 * Output is steered by two bit masks given to the constructor: the save
 * options (FORMAT ... AS_BUILDER) and the canonicalization options
 * (CANONICAL ... SUBSETS). When the canonical flag is set, visited nodes are
 * additionally recorded in a list available through getC14nNodeList().
 *
 * NOTE(review): this copy of the file appears to have been damaged by a
 * markup-stripping step: generic type arguments are missing and several
 * string literals / loop headers are visibly incomplete. Such places carry a
 * NOTE(review) below and must be restored from the upstream source before
 * this file can compile.
 *
 * @author sergio
 * @author Patrick Mahoney
 * @author Yoko Harada
 */
public class SaveContextVisitor {

    // Accumulates the serialized output.
    private StringBuffer buffer;
    // Stack of indentation prefixes; the top entry is the prefix for the current depth.
    // NOTE(review): declared raw here, yet peek() is assigned to a String below -
    // the element type looks lost in extraction; confirm against upstream.
    private Stack indentation;
    private String encoding, indentString;
    // Flags decoded from the "options" bit mask, plus the two booleans passed to the constructor.
    private boolean format, noDecl, noEmpty, noXhtml, asXhtml, asXml, asHtml, asBuilder, htmlDoc, fragment;
    // Flags decoded from the "canonicalOpts" bit mask.
    private boolean canonical, incl_ns, with_comments, subsets, exclusive;
    // Nodes recorded while serializing in canonical mode.
    private List c14nNodeList;
    // Per-element arrays of namespace declarations pushed while descending in canonical mode.
    private Deque c14nNamespaceStack;
    // Per-element arrays of ordinary attributes pushed while descending in canonical mode.
    private Deque c14nAttrStack;
    private List c14nExclusiveInclusivePrefixes = null;

    /*
     * U can't touch this.
     * http://www.youtube.com/watch?v=WJ2ZFVx6A4Q
     *
     * Taken from libxml save options.
     *
     * Bit values for the "options" constructor argument.
     */

    public static final int FORMAT = 1;
    public static final int NO_DECL = 2;
    public static final int NO_EMPTY = 4;
    public static final int NO_XHTML = 8;
    public static final int AS_XHTML = 16;
    public static final int AS_XML = 32;
    public static final int AS_HTML = 64;
    public static final int AS_BUILDER = 128;

    // Bit values for the "canonicalOpts" constructor argument.
    public static final int CANONICAL = 1;
    public static final int INCL_NS = 2;
    public static final int WITH_COMMENTS = 4;
    public static final int SUBSETS = 8;
    public static final int EXCLUSIVE = 16;

    /**
     * Creates a visitor and decodes the option bit masks into flags.
     *
     * @param options       bit mask built from FORMAT, NO_DECL, NO_EMPTY, NO_XHTML,
     *                      AS_XHTML, AS_XML, AS_HTML and AS_BUILDER
     * @param indent        string appended once per nesting level; may be null or empty
     * @param encoding      encoding name stored for later use; may be null
     * @param htmlDoc       stored as-is; later passed to attribute value escaping
     * @param fragment      stored as-is; when true the indentation/line-break helpers
     *                      all answer false
     * @param canonicalOpts bit mask built from CANONICAL, INCL_NS, WITH_COMMENTS and SUBSETS
     */
    public SaveContextVisitor(int options, String indent, String encoding, boolean htmlDoc, boolean fragment, int canonicalOpts) {
        buffer = new StringBuffer();
        this.encoding = encoding;
        indentation = new Stack();
        indentation.push("");
        this.htmlDoc = htmlDoc;
        this.fragment = fragment;
        c14nNodeList = new ArrayList();
        c14nNamespaceStack = new ArrayDeque();
        c14nAttrStack = new ArrayDeque();
        format = (options & FORMAT) == FORMAT;

        noDecl = (options & NO_DECL) == NO_DECL;
        noEmpty = (options & NO_EMPTY) == NO_EMPTY;
        noXhtml = (options & NO_XHTML) == NO_XHTML;
        asXhtml = (options & AS_XHTML) == AS_XHTML;
        asXml = (options & AS_XML) == AS_XML;
        asHtml = (options & AS_HTML) == AS_HTML;
        asBuilder = (options & AS_BUILDER) == AS_BUILDER;

        canonical = (canonicalOpts & CANONICAL) == CANONICAL;
        incl_ns = (canonicalOpts & INCL_NS) == INCL_NS;
        with_comments = (canonicalOpts & WITH_COMMENTS) == WITH_COMMENTS;
        subsets = (canonicalOpts & SUBSETS) == SUBSETS;
        // NOTE(review): the EXCLUSIVE bit is not decoded here, so "exclusive"
        // keeps its default value unless it is set somewhere not visible.

        // Formatting requested without an indent string: fall back to the default.
        // NOTE(review): the comment says two spaces but the literal as shown holds
        // one - whitespace was probably collapsed in extraction; confirm.
        if ((format && indent == null) || (format && indent.length() == 0)) indent = " "; // default, two spaces
        // A non-empty indent string switches formatting on even if FORMAT was not set.
        if ((!format && indent != null) && indent.length() > 0) format = true;
        if ((asBuilder && indent == null) || (asBuilder && indent.length() == 0)) indent = " "; // default, two spaces
        indentString = indent;
        // No output flavour selected at all: behave as AS_XML.
        if (!asXml && !asHtml && !asXhtml && !asBuilder) asXml = true;
    }

    /** Returns a copy of everything appended to the buffer so far. */
    @Override
    public String toString() {
        return (new String(buffer));
    }

    /** Overrides the htmlDoc flag given to the constructor. */
    public void setHtmlDoc(boolean htmlDoc) {
        this.htmlDoc = htmlDoc;
    }

    /** Overrides the encoding given to the constructor. */
    public void setEncoding(String encoding)
    {
        this.encoding = encoding;
    }

    /** Returns the live list of nodes recorded in canonical mode (not a copy). */
    public List getC14nNodeList() {
        return c14nNodeList;
    }

    /** Stores the given prefix list; no use of it is visible in this part of the file. */
    public void setC14nExclusiveInclusivePrefixes(List prefixes) {
        c14nExclusiveInclusivePrefixes = prefixes;
    }

    /**
     * Dispatches to the enter overload matching the runtime type of the node.
     *
     * @return the result of the selected overload, or false when no test matches
     */
    public boolean enter(Node node) {
        if (node instanceof Document) {
            return enter((Document)node);
        }
        if (node instanceof Element) {
            return enter((Element)node);
        }
        if (node instanceof Attr) {
            return enter((Attr)node);
        }
        if (node instanceof Text) {
            return enter((Text)node);
        }
        // NOTE(review): in org.w3c.dom, CDATASection is a sub-interface of Text, so a
        // CDATA node already matches the Text test above - confirm the order is intended.
        if (node instanceof CDATASection) {
            return enter((CDATASection)node);
        }
        if (node instanceof Comment) {
            return enter((Comment)node);
        }
        if (node instanceof DocumentType) {
            return enter((DocumentType)node);
        }
        if (node instanceof Entity) {
            return enter((Entity)node);
        }
        // NOTE(review): no enter(EntityReference) overload is visible in this file
        // (only enterEntityReference(Text)); if none exists this call binds to
        // enter(Node) again - confirm.
        if (node instanceof EntityReference) {
            return enter((EntityReference)node);
        }
        if (node instanceof Notation) {
            return enter((Notation)node);
        }
        if (node instanceof ProcessingInstruction) {
            return enter((ProcessingInstruction)node);
        }
        return false;
    }

    /**
     * Dispatches to the leave overload matching the runtime type of the node;
     * does nothing when no test matches. The tests run in the same order as in
     * enter(Node), so the same review notes apply.
     */
    public void leave(Node node) {
        if (node instanceof Document) {
            leave((Document)node);
            return;
        }
        if (node instanceof Element) {
            leave((Element)node);
            return;
        }
        if (node instanceof Attr) {
            leave((Attr)node);
            return;
        }
        if (node instanceof Text) {
            leave((Text)node);
            return;
        }
        if (node instanceof CDATASection) {
            leave((CDATASection)node);
            return;
        }
        if (node instanceof Comment) {
            leave((Comment)node);
            return;
        }
        if (node instanceof DocumentType) {
            leave((DocumentType)node);
            return;
        }
        if (node instanceof Entity) {
            leave((Entity)node);
            return;
        }
        if (node instanceof EntityReference) {
            leave((EntityReference)node);
            return;
        }
        if (node instanceof Notation) {
            leave((Notation)node);
            return;
        }
        if (node instanceof ProcessingInstruction) {
            leave((ProcessingInstruction)node);
            return;
        }
    }

    /** Appends the string verbatim (no escaping) and returns true. */
    public boolean enter(String string) {
        buffer.append(string);
        return true;
    }

    public void leave(String string) {
        // no-op
    }

    /**
     * Appends an attribute as name="value".
     *
     * When producing HTML and the name is listed in HTML_BOOLEAN_ATTRS, only the
     * name is written. Otherwise the value goes through
     * replaceCharsetIfNecessary and serializeAttrTextContent first.
     *
     * @return always true
     */
    public boolean enter(Attr attr) {
        String name = attr.getName();
        buffer.append(name);
        if (!asHtml || !isHtmlBooleanAttr(name)) {
            buffer.append("=");
            buffer.append("\"");
            String value = replaceCharsetIfNecessary(attr);
            buffer.append(serializeAttrTextContent(value, htmlDoc));
            buffer.append("\"");
        }
        return true;
    }

    // Matches "charset", optional whitespace, "=", optional whitespace, then one or
    // more word characters, '_', '.' or '-'; case-insensitive.
    private static Pattern p =
        Pattern.compile("charset(()|\\s+)=(()|\\s+)(\\w|\\_|\\.|\\-)+", Pattern.CASE_INSENSITIVE);

    /**
     * Returns the attribute value, with its charset declaration rewritten to the
     * configured encoding when the attribute is the "content" attribute of a
     * "meta" element (both names compared case-insensitively).
     *
     * The value is returned unchanged when no encoding is set, when the pattern
     * does not match, or when the value already contains the encoding string.
     */
    private String replaceCharsetIfNecessary(Attr attr) {
        String value = attr.getValue();
        if (encoding == null) return value; // unable to replace in any case
        if (!"content".equals(attr.getName().toLowerCase())) return value; // must be content attr
        if (!"meta".equals(attr.getOwnerElement().getNodeName().toLowerCase())) return value;
        Matcher m = p.matcher(value);
        if (!m.find()) return value;
        if (value.contains(encoding)) return value; // no need to replace
        // String.replace substitutes every occurrence of the first matched text.
        return value.replace(m.group(), "charset=" + encoding);
    }

    // Attribute names that enter(Attr) writes without a value in HTML output.
    public static final String[] HTML_BOOLEAN_ATTRS = {
        "checked", "compact", "declare", "defer", "disabled", "ismap",
        "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
        "selected"
    };

    /** Returns true when the name equals (case-sensitively) an entry of HTML_BOOLEAN_ATTRS. */
    private boolean isHtmlBooleanAttr(String name) {
        for (String s : HTML_BOOLEAN_ATTRS) {
            if (s.equals(name)) return true;
        }
        return false;
    }

    /**
     * Returns the attribute value with selected characters replaced, or the
     * empty string for null.
     *
     * NOTE(review): the replacement literals below look damaged by extraction -
     * the whitespace cases all show a single space, the non-HTML quote case is
     * not a valid literal, and the '<', '>' and '&' cases map each character to
     * itself. They were presumably character/entity references; restore from
     * upstream. Only the "%22" replacement for quotes in HTML documents is
     * intact.
     */
    private String serializeAttrTextContent(String s, boolean htmlDoc) {
        if (s == null) return "";

        char[] c = s.toCharArray();
        StringBuffer buffer = new StringBuffer(c.length);

        for(int i = 0; i < c.length; i++) {
            switch(c[i]){
            case '\n': buffer.append(" "); break;
            case '\r': buffer.append(" "); break;
            case '\t': buffer.append(" "); break;
            case '"': if (htmlDoc) buffer.append("%22");
                else buffer.append(""");
                break;
            case '<': buffer.append("<"); break;
            case '>': buffer.append(">"); break;
            case '&': buffer.append("&"); break;
            default: buffer.append(c[i]);
            }
        }

        return buffer.toString();
    }

    public void leave(Attr attr) {
        // no-op
    }

    /**
     * Appends a CDATA section.
     *
     * NOTE(review): as shown, only an empty literal is appended and the section's
     * data is never read; the delimiters and the data append look stripped in
     * extraction - restore from upstream.
     */
    public boolean enter(CDATASection cdata) {
        buffer.append("");
        return true;
    }

    public void leave(CDATASection cdata) {
        // no-op
    }

    /**
     * Appends a comment. In canonical mode the node is recorded and, unless
     * with_comments is set, nothing is written.
     *
     * NOTE(review): the appended literal is empty as shown; the comment
     * delimiters and data append look stripped in extraction.
     */
    public boolean enter(Comment comment) {
        if (canonical) {
            c14nNodeList.add(comment);
            if (!with_comments) return true;
        }
        buffer.append("");
        return true;
    }

    public void leave(Comment comment) {
        // no-op
    }

    /**
     * Appends the document prologue unless NO_DECL was requested.
     *
     * NOTE(review): as shown only a newline is appended; the declaration text
     * that presumably preceded it looks stripped in extraction.
     */
    public boolean enter(Document document) {
        if (!noDecl) {
            buffer.append("\n");
        }
        return true;
    }

    public void leave(Document document) {
        // no-op
    }

    /**
     * Appends a document type declaration; in canonical mode the node is only
     * recorded and nothing is written. A newline is written first when the
     * doctype has a previous sibling.
     *
     * NOTE(review): name, pubId, sysId and internalSubset are read but never used
     * in the code as shown - the statements building the declaration look
     * stripped in extraction; restore from upstream.
     */
    public boolean enter(DocumentType docType) {
        if (canonical) {
            c14nNodeList.add(docType);
            return true;
        }
        String name = docType.getName();
        String pubId = docType.getPublicId();
        String sysId = docType.getSystemId();
        String internalSubset = docType.getInternalSubset();
        if (docType.getPreviousSibling() != null) {
            buffer.append("\n");
        }
        buffer.append("\n");
        return true;
    }

    public void leave(DocumentType docType) {
        // no-op
    }

    /**
     * Appends an element's start tag (or its complete tag when it has no children).
     *
     * Writes the current indentation prefix, pushes a deeper prefix when
     * needIndent() holds, then the tag name and every attribute whose
     * getSpecified() is true. In canonical mode the element is recorded, and so
     * is its owner document when the element is the document element.
     *
     * @return always true
     */
    public boolean enter(Element element) {
        if (canonical) {
            c14nNodeList.add(element);
            if (element == element.getOwnerDocument().getDocumentElement()) {
                c14nNodeList.add(element.getOwnerDocument());
            }
        }
        String current = indentation.peek();
        buffer.append(current);
        if (needIndent()) {
            indentation.push(current + indentString);
        }
        String name = element.getTagName();
        buffer.append("<" + name);
        Attr[] attrs = getAttrsAndNamespaces(element);
        for (Attr attr : attrs) {
            if (attr.getSpecified()) {
                buffer.append(" ");
                enter(attr);
                leave(attr);
            }
        }
        if (element.hasChildNodes()) {
            buffer.append(">");
            if (needBreakInOpening(element)) buffer.append("\n");
            return true;
        }
        // no child
        // HTML/XHTML output, and XML output with NO_EMPTY, never self-close here;
        // any closing tag is left to leave(Element).
        if (asHtml || asXhtml) {
            buffer.append(">");
        } else if (asXml && noEmpty) {
            buffer.append(">");
        } else {
            buffer.append("/>");
        }
        if (needBreakInOpening(element)) {
            buffer.append("\n");
        }
        return true;
    }

    /** True when entering an element should push a deeper indentation prefix. */
    private boolean needIndent() {
        if (fragment) return false;  // a given option might be fragment and format. fragment matters
        if (format || asBuilder) return true;
        return false;
    }

    /** True when a newline should follow the element's opening tag. */
    private boolean needBreakInOpening(Element element) {
        if (fragment) return false;
        if (format) return true;
        if (asBuilder && element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.ELEMENT_NODE) return true;
        // NOTE(review): unreachable as written - format == true already returned above.
        if (format && element.getNextSibling() == null && element.hasChildNodes()) return true;
        return false;
    }

    /**
     * Looks the tag name up in HTMLElements and returns that entry's isEmpty().
     * presumably true for elements that take no content - verify against the
     * HTMLElements documentation.
     */
    private boolean isEmpty(String name) {
        HTMLElements.Element element = HTMLElements.getElement(name);
        return element.isEmpty();
    }

    /**
     * Returns the attributes to write for an element.
     *
     * Outside canonical mode this is meant to be the element's attributes in map
     * order (an empty array when there are none). In canonical mode namespace
     * declarations and ordinary attributes are collected separately, each sorted
     * by node name; when "subsets" is set, the ancestors' attributes are
     * collected and pushed first, once, after which the flag is cleared.
     *
     * NOTE(review): two spans of this method were lost in extraction - the body
     * of the first loop up to the declaration of "namespaces", and everything
     * from the second loop header to the parameter list of the following method
     * (getAttrsOfAncestors). Neither can be recovered from what is visible.
     */
    private Attr[] getAttrsAndNamespaces(Element element) {
        NamedNodeMap attrs = element.getAttributes();
        if (!canonical) {
            if (attrs == null || attrs.getLength() == 0) return new Attr[0];
            Attr[] attrsAndNamespaces = new Attr[attrs.getLength()];
            for (int i=0; i namespaces = new ArrayList();
            List attributes = new ArrayList();
            if (subsets) {
                getAttrsOfAncestors(element.getParentNode(), namespaces, attributes);
                Attr[] namespaceOfAncestors = getSortedArray(namespaces);
                Attr[] attributeOfAncestors = getSortedArray(attributes);
                c14nNamespaceStack.push(namespaceOfAncestors);
                c14nAttrStack.push(attributeOfAncestors);
                subsets = false; // namespace propagation should be done only once on top level node.
            }

            getNamespacesAndAttrs(element, namespaces, attributes);

            Attr[] namespaceArray = getSortedArray(namespaces);
            Attr[] attributeArray = getSortedArray(attributes);
            Attr[] allAttrs = new Attr[namespaceArray.length + attributeArray.length];
            for (int i=0; i namespaces, List attributes) {
        // Body of getAttrsOfAncestors (its header is part of the lost span above):
        // walks up the parent chain, sorting each ancestor attribute into
        // "namespaces" or "attributes" according to isNamespace(nodeName).
        // Recursion stops at a null parent or at the first ancestor without attributes.
        if (parent == null) return;
        NamedNodeMap attrs = parent.getAttributes();
        if (attrs == null || attrs.getLength() == 0) return;
        for (int i=0; i < attrs.getLength(); i++) {
            Attr attr = (Attr)attrs.item(i);
            if (isNamespace(attr.getNodeName())) namespaces.add(attr);
            else attributes.add(attr);
        }
        getAttrsOfAncestors(parent.getParentNode(), namespaces, attributes);
    }

    /**
     * Collects the namespace declarations and attributes of the given node.
     *
     * NOTE(review): the loop body and the header of the following method
     * (apparently getNamespacesWithPropagated, judging by the name used in the
     * sibling getAttributesWithPropagated) were lost in extraction.
     */
    private void getNamespacesAndAttrs(Node current, List namespaces, List attributes) {
        NamedNodeMap attrs = current.getAttributes();
        for (int i=0; i namespaces, Attr attr) {
        // From here on: adds "attr" to "namespaces" unless an entry with the same
        // name and value is already on the namespace stack.
        boolean newNamespace = true;
        Iterator iter = c14nNamespaceStack.iterator();
        while (iter.hasNext()) {
            Attr[] parentNamespaces = iter.next();
            for (int n=0; n < parentNamespaces.length; n++) {
                if (parentNamespaces[n].getNodeName().equals(attr.getNodeName())) {
                    if (parentNamespaces[n].getNodeValue().equals(attr.getNodeValue())) {
                        // exactly the same namespace should not be added
                        newNamespace = false;
                    } else {
                        // the namespace URI changed: the propagated declaration is overridden
                        namespaces.remove(parentNamespaces[n]);
                    }
                }
            }
            // NOTE(review): this add sits inside the while loop, so nothing is added
            // when the stack is empty - confirm that is intended.
            if (newNamespace && !namespaces.contains(attr)) namespaces.add(attr);
        }
    }

    /**
     * Adds "attr" to "attributes" unless an "xml:"-prefixed entry with the same
     * name and value is already on the attribute stack; stack entries whose name
     * does not start with "xml:" are ignored.
     */
    private void getAttributesWithPropagated(List attributes, Attr attr) {
        boolean newAttribute = true;
        Iterator iter = c14nAttrStack.iterator();
        while (iter.hasNext()) {
            Attr[] parentAttr = iter.next();
            for (int n=0; n < parentAttr.length; n++) {
                if (!parentAttr[n].getNodeName().startsWith("xml:")) continue;
                if (parentAttr[n].getNodeName().equals(attr.getNodeName())) {
                    if (parentAttr[n].getNodeValue().equals(attr.getNodeValue())) {
                        // exactly the same attribute should not be added
                        newAttribute = false;
                    } else {
                        // the attribute value changed: the propagated attribute is overridden
                        attributes.remove(parentAttr[n]);
                    }
                }
            }
            // NOTE(review): inside the while loop and without a contains() check, so
            // "attr" is added once per stack entry and never for an empty stack - confirm.
            if (newAttribute) attributes.add(attr);
        }
    }

    /**
     * Removes the first "xml:space" entry from "attributes" when the given
     * attribute map has no "xml:space" item of its own.
     */
    private void verifyXmlSpace(List attributes, NamedNodeMap attrs) {
        Attr attr = (Attr) attrs.getNamedItem("xml:space");
        if (attr == null) {
            for (int i=0; i < attributes.size(); i++) {
                if (attributes.get(i).getNodeName().equals("xml:space")) {
                    attributes.remove(i);
                    break;
                }
            }
        }
    }

    /** Copies the list into an array sorted by node name (String.compareTo order). */
    private Attr[] getSortedArray(List attrList) {
        Attr[] attrArray = attrList.toArray(new Attr[0]);
        Arrays.sort(attrArray, new Comparator() {
            @Override
            public int compare(Attr attr0, Attr attr1) {
                return attr0.getNodeName().compareTo(attr1.getNodeName());
            }
        });
        return attrArray;
    }

    /**
     * Finishes an element: pops the canonical-mode stacks, restores the
     * indentation level and writes the closing tag and trailing newline where
     * the helpers call for them.
     *
     * NOTE(review): both closing-tag literals are empty as shown although "name"
     * is read for them; the tag text looks stripped in extraction.
     */
    public void leave(Element element) {
        if (canonical) {
            c14nNamespaceStack.poll();
            c14nAttrStack.poll();
        }
        String name = element.getTagName();
        if (element.hasChildNodes()) {
            if (needIndentInClosing(element)) {
                indentation.pop();
                buffer.append(indentation.peek());
            } else if (asBuilder) {
                // builder output pushed a level in enter(Element) even when the
                // closing tag itself is not indented
                indentation.pop();
            }
            buffer.append("");
            if (needBreakInClosing()) {
                buffer.append("\n");
            }
            return;
        }
        // no child, but HTML might need a closing tag.
        if (asHtml || noEmpty) {
            if (!isEmpty(name) && noEmpty) {
                buffer.append("");
            }
        }
        if (needBreakInClosing()) {
            indentation.pop();
            buffer.append("\n");
        }
    }

    /** True when the closing tag should be preceded by the parent's indentation prefix. */
    private boolean needIndentInClosing(Element element) {
        if (fragment) return false;  // a given option might be fragment and format. fragment matters
        if (format) return true;
        if (asBuilder && element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.ELEMENT_NODE) return true;
        return false;
    }

    /** True when a newline should follow the element's closing tag. */
    private boolean needBreakInClosing() {
        if (fragment) return false;
        if (format || asBuilder) return true;
        return false;
    }

    /**
     * Appends an entity declaration.
     *
     * NOTE(review): name, pubId, sysId and notation are read but unused and the
     * appended literal is empty as shown; the declaration text looks stripped in
     * extraction.
     */
    public boolean enter(Entity entity) {
        String name = entity.getNodeName();
        String pubId = entity.getPublicId();
        String sysId = entity.getSystemId();
        String notation = entity.getNotationName();
        buffer.append("");
        return true;
    }

    public void leave(Entity entity) {
        // no-op
    }

    /** Appends "&", the node's name and ";" for a Text node standing in for an entity reference. */
    public boolean enterEntityReference(Text entityRef) {
        String name = entityRef.getNodeName();
        buffer.append("&" + name + ";");
        return true;
    }

    public void leaveEntityReference(Text entityRef) {
        // no-op
    }

    /**
     * Appends a notation declaration.
     *
     * NOTE(review): name, pubId and sysId are read but unused and the appended
     * literal is empty as shown; the declaration text looks stripped in extraction.
     */
    public boolean enter(Notation notation) {
        String name = notation.getNodeName();
        String pubId = notation.getPublicId();
        String sysId = notation.getSystemId();
        buffer.append("");
        return true;
    }

    public void leave(Notation notation) {
        // no-op
    }

    /**
     * Appends a processing instruction followed by a newline and, in canonical
     * mode, records the node.
     *
     * NOTE(review): the opening part of this method was lost in extraction - the
     * "else" below has no visible "if". Restore from upstream.
     */
    public boolean enter(ProcessingInstruction pi) {
        buffer.append("");
        else buffer.append("?>");
        buffer.append("\n");
        if (canonical) c14nNodeList.add(pi);
        return true;
    }

    public void leave(ProcessingInstruction pi) {
        // no-op
    }

    // Line terminator the text-node handling compares against; fixed to '\n'
    // rather than taken from the platform.
    private static char lineSeparator = '\n'; // System.getProperty("line.separator"); ?
public boolean enter(Text text) { String textContent = text.getNodeValue(); if (canonical) { c14nNodeList.add(text); if (isWhitespaceText(textContent)) { buffer.append(canonicalizeWhitespce(textContent)); return true; } } if (needIndentText() && "".equals(textContent.trim())) return true; if (needIndentText()) { String current = indentation.peek(); buffer.append(current); indentation.push(current + indentString); if (textContent.charAt(0) == lineSeparator) textContent = textContent.substring(1); } if (text.getUserData(NokogiriHelpers.ENCODED_STRING) == null || !((Boolean)text.getUserData(NokogiriHelpers.ENCODED_STRING))) { textContent = encodeJavaString(textContent); } if (getEncoding(text) == null) { textContent = encodeStringToHtmlEntity(textContent); } buffer.append(textContent); return true; } private boolean needIndentText() { if (fragment) return false; if (format) return true; return false; } public void leave(Text text) { String textContent = text.getNodeValue(); if (needIndentText() && !"".equals(textContent.trim())) { indentation.pop(); if (textContent.charAt(textContent.length()-1) != lineSeparator) { buffer.append("\n"); } } } private String getEncoding(Text text) { if (encoding != null) return encoding; encoding = text.getOwnerDocument().getInputEncoding(); return encoding; } private String encodeStringToHtmlEntity(String text) { int last = 126; // = U+007E. No need to encode under U+007E. StringBuffer sb = new StringBuffer(); for (int i=0; i last) sb.append("&#x" + Integer.toHexString(codePoint) + ";"); else sb.append(text.charAt(i)); } return new String(sb); } }