package nokogiri; import static nokogiri.internals.NokogiriHelpers.clearXpathContext; import static nokogiri.internals.NokogiriHelpers.getCachedNodeOrCreate; import static nokogiri.internals.NokogiriHelpers.getNokogiriClass; import static nokogiri.internals.NokogiriHelpers.isNamespace; import static nokogiri.internals.NokogiriHelpers.rubyStringToString; import static nokogiri.internals.NokogiriHelpers.stringOrNil; import java.util.List; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.jcodings.specific.USASCIIEncoding; import org.jcodings.specific.UTF8Encoding; import org.jruby.Ruby; import org.jruby.RubyArray; import org.jruby.RubyClass; import org.jruby.RubyFixnum; import org.jruby.RubyString; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import org.jruby.exceptions.RaiseException; import org.jruby.javasupport.JavaUtil; import org.jruby.runtime.Block; import org.jruby.runtime.Helpers; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.Visibility; import org.jruby.runtime.builtin.IRubyObject; import org.jruby.util.ByteList; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.DocumentType; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import nokogiri.internals.NokogiriHelpers; import nokogiri.internals.NokogiriNamespaceCache; import nokogiri.internals.SaveContextVisitor; import nokogiri.internals.XmlDomParserContext; import nokogiri.internals.c14n.CanonicalFilter; import nokogiri.internals.c14n.CanonicalizationException; import nokogiri.internals.c14n.Canonicalizer; /** * Class for Nokogiri::XML::Document * * @author sergio * @author Yoko Harada * @author John Shahid */ @JRubyClass(name = "Nokogiri::XML::Document", parent = "Nokogiri::XML::Node") public class XmlDocument extends XmlNode { private static final long serialVersionUID = 1L; private NokogiriNamespaceCache nsCache; /* UserData keys for storing extra info in the document node. */ public final static String DTD_RAW_DOCUMENT = "DTD_RAW_DOCUMENT"; public final static String DTD_INTERNAL_SUBSET = "DTD_INTERNAL_SUBSET"; public final static String DTD_EXTERNAL_SUBSET = "DTD_EXTERNAL_SUBSET"; /* DocumentBuilderFactory implementation class name. This needs to set a classloader into it. * Setting an appropriate classloader resolves issue 380. */ private static final String DOCUMENTBUILDERFACTORY_IMPLE_NAME = "org.apache.xerces.jaxp.DocumentBuilderFactoryImpl"; private static final ByteList DOCUMENT = ByteList.create("document"); static { DOCUMENT.setEncoding(USASCIIEncoding.INSTANCE); } private static boolean substituteEntities = false; private static boolean loadExternalSubset = false; // TODO: Verify this. /** cache variables */ protected IRubyObject encoding; protected IRubyObject url; public XmlDocument(Ruby runtime, RubyClass klazz) { super(runtime, klazz, createNewDocument(runtime)); } public XmlDocument(Ruby runtime, Document document) { this(runtime, getNokogiriClass(runtime, "Nokogiri::XML::Document"), document); } public XmlDocument(Ruby runtime, RubyClass klass, Document document) { super(runtime, klass, document); init(runtime, document); } void init(Ruby runtime, Document document) { stabilizeTextContent(document); if (document.getDocumentElement() != null) { createAndCacheNamespaces(runtime, document.getDocumentElement()); } setInstanceVariable("@decorators", runtime.getNil()); } public final void setDocumentNode(Ruby runtime, Document node) { super.setNode(runtime, node); if (node != null) { init(runtime, node); } else { setInstanceVariable("@decorators", runtime.getNil()); } } public void setEncoding(IRubyObject encoding) { this.encoding = encoding; } public IRubyObject getEncoding() { return encoding; } // not sure, but like attribute values, text value will be lost // unless it is referred once before this document is used. // this seems to happen only when the fragment is parsed from Node#in_context. protected static void stabilizeTextContent(Document document) { if (document.getDocumentElement() != null) { document.getDocumentElement().getTextContent(); } } private static void createAndCacheNamespaces(Ruby runtime, Node node) { if (node.hasAttributes()) { NamedNodeMap nodeMap = node.getAttributes(); for (int i = 0; i < nodeMap.getLength(); i++) { Node n = nodeMap.item(i); if (n instanceof Attr) { Attr attr = (Attr) n; stabilizeAttr(attr); if (isNamespace(attr.getName())) { // create and cache XmlNamespace.createFromAttr(runtime, attr); } } } } NodeList children = node.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { createAndCacheNamespaces(runtime, children.item(i)); } } static void stabilizeAttr(final Attr attr) { // TODO not sure, but need to get value always before document is referred or lose attribute value attr.getValue(); // don't delete this line } // When a document is created from fragment with a context (reference) document, // namespace should be resolved based on the context document. public XmlDocument(Ruby ruby, RubyClass klass, Document document, XmlDocument contextDoc) { super(ruby, klass, document); nsCache = contextDoc.getNamespaceCache(); String default_href = nsCache.getDefault().getHref(); resolveNamespaceIfNecessary(document.getDocumentElement(), default_href); } private void resolveNamespaceIfNecessary(Node node, String default_href) { if (node == null) { return; } String nodePrefix = node.getPrefix(); if (nodePrefix == null) { // default namespace NokogiriHelpers.renameNode(node, default_href, node.getNodeName()); } else { String href = getNamespaceCache().get(node, nodePrefix).getHref(); NokogiriHelpers.renameNode(node, href, node.getNodeName()); } resolveNamespaceIfNecessary(node.getNextSibling(), default_href); NodeList children = node.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { resolveNamespaceIfNecessary(children.item(i), default_href); } } public NokogiriNamespaceCache getNamespaceCache() { if (nsCache == null) { nsCache = new NokogiriNamespaceCache(); } return nsCache; } public Document getDocument() { return (Document) node; } @Override protected IRubyObject getNodeName(ThreadContext context) { if (name == null) { name = RubyString.newStringShared(context.runtime, DOCUMENT); } return name; } public void setUrl(IRubyObject url) { this.url = url; } protected IRubyObject getUrl() { return this.url; } @JRubyMethod public IRubyObject url(ThreadContext context) { return getUrl(); } public static Document createNewDocument(final Ruby runtime) { try { return DocumentBuilderFactoryHolder.INSTANCE.newDocumentBuilder().newDocument(); } catch (ParserConfigurationException e) { throw asRuntimeError(runtime, null, e); } } private static class DocumentBuilderFactoryHolder { static final DocumentBuilderFactory INSTANCE; static { INSTANCE = DocumentBuilderFactory.newInstance(DOCUMENTBUILDERFACTORY_IMPLE_NAME, NokogiriService.class.getClassLoader()); } } static RaiseException asRuntimeError(Ruby runtime, String message, Exception cause) { if (cause instanceof RaiseException) { return (RaiseException) cause; } if (message == null) { message = cause.toString(); } else { message = message + '(' + cause.toString() + ')'; } RaiseException ex = runtime.newRuntimeError(message); ex.initCause(cause); return ex; } /* * call-seq: * new(version = default) * * Create a new document with +version+ (defaults to "1.0") */ @JRubyMethod(name = "new", meta = true, rest = true, required = 0) public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz, IRubyObject[] args) { final Ruby runtime = context.runtime; XmlDocument xmlDocument; try { Document docNode = createNewDocument(runtime); if ("Nokogiri::HTML4::Document".equals(((RubyClass)klazz).getName())) { xmlDocument = new Html4Document(context.runtime, (RubyClass) klazz, docNode); } else { xmlDocument = new XmlDocument(context.runtime, (RubyClass) klazz, docNode); } } catch (Exception ex) { throw asRuntimeError(runtime, "couldn't create document: ", ex); } Helpers.invoke(context, xmlDocument, "initialize", args); return xmlDocument; } @JRubyMethod(required = 1, optional = 4) public IRubyObject create_entity(ThreadContext context, IRubyObject[] argv) { // FIXME: Entity node should be create by some right way. // this impl passes tests, but entity doesn't exists in DTD, which // would cause validation failure. if (argv.length == 0) { throw context.runtime.newRuntimeError("Could not create entity"); } String tagName = rubyStringToString(argv[0]); Node node = getOwnerDocument().createElement(tagName); return XmlEntityDecl.create(context, node, argv); } @Override XmlDocument document(Ruby runtime) { return this; } @JRubyMethod(name = "encoding=") public IRubyObject encoding_set(IRubyObject encoding) { this.encoding = encoding; return this; } @JRubyMethod public IRubyObject encoding(ThreadContext context) { if (this.encoding == null || this.encoding.isNil()) { final String enc = getDocument().getXmlEncoding(); if (enc == null) { this.encoding = context.nil; } else { this.encoding = context.runtime.newString(enc); } } return this.encoding.isNil() ? this.encoding : this.encoding.asString().encode(context, context.getRuntime().newString("UTF-8")); } @JRubyMethod(meta = true) public static IRubyObject load_external_subsets_set(ThreadContext context, IRubyObject cls, IRubyObject value) { XmlDocument.loadExternalSubset = value.isTrue(); return context.nil; } @JRubyMethod(meta = true, required = 4) public static IRubyObject read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args) { XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[2], args[3]); ctx.setIOInputSource(context, args[0], args[1]); return ctx.parse(context, (RubyClass) klass, args[1]); } @JRubyMethod(meta = true, required = 4) public static IRubyObject read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args) { XmlDomParserContext ctx = new XmlDomParserContext(context.runtime, args[2], args[3]); ctx.setStringInputSource(context, args[0], args[1]); return ctx.parse(context, (RubyClass) klass, args[1]); } @JRubyMethod(name = "remove_namespaces!") public IRubyObject remove_namespaces(ThreadContext context) { removeNamespaceRecursively(this); if (nsCache != null) { nsCache.clear(); } clearXpathContext(getNode()); return this; } private void removeNamespaceRecursively(XmlNode xmlNode) { Node node = xmlNode.node; if (node.getNodeType() == Node.ELEMENT_NODE) { node.setPrefix(null); NokogiriHelpers.renameNode(node, null, node.getLocalName()); NamedNodeMap attrs = node.getAttributes(); for (int i = 0; i < attrs.getLength(); i++) { Attr attr = (Attr) attrs.item(i); if (isNamespace(attr.getNodeName())) { ((org.w3c.dom.Element) node).removeAttributeNode(attr); } else { attr.setPrefix(null); NokogiriHelpers.renameNode(attr, null, attr.getLocalName()); } } } IRubyObject[] nodes = xmlNode.getChildren(); for (int i = 0; i < nodes.length; i++) { XmlNode childNode = (XmlNode) nodes[i]; removeNamespaceRecursively(childNode); } } @JRubyMethod public IRubyObject root(ThreadContext context) { Node rootNode = getDocument().getDocumentElement(); if (rootNode == null) { return context.nil; } Object invalid = rootNode.getUserData(NokogiriHelpers.ROOT_NODE_INVALID); if (invalid != null && ((Boolean) invalid)) { return context.nil; } return getCachedNodeOrCreate(context.runtime, rootNode); } @JRubyMethod(visibility = Visibility.PROTECTED) public IRubyObject initialize_copy_with_args(ThreadContext context, IRubyObject other, IRubyObject level) { super.initialize_copy_with_args(context, other, level, null); resetCache(); return this; } @JRubyMethod(name = "root=") public IRubyObject root_set(ThreadContext context, IRubyObject new_root) { // in case of document fragment, temporary root node should be deleted. // Java can't have a root whose value is null. Instead of setting null, // the method sets user data so that other methods are able to know the root // should be nil. if (new_root == context.nil) { getDocument().getDocumentElement().setUserData(NokogiriHelpers.ROOT_NODE_INVALID, Boolean.TRUE, null); return new_root; } if (!(new_root instanceof XmlNode)) { throw context.runtime.newArgumentError("expected Nokogiri::XML::Node but received " + new_root.getType()); } XmlNode newRoot = asXmlNode(context, new_root); IRubyObject root = root(context); if (root.isNil()) { Node newRootNode; if (getDocument() == newRoot.getOwnerDocument()) { newRootNode = newRoot.node; } else { // must copy otherwise newRoot may exist in two places // with different owner document. newRootNode = getDocument().importNode(newRoot.node, true); } add_child_node(context, getCachedNodeOrCreate(context.runtime, newRootNode)); } else { Node rootNode = asXmlNode(context, root).node; ((XmlNode) getCachedNodeOrCreate(context.runtime, rootNode)).replace_node(context, newRoot); } return newRoot; } @JRubyMethod public IRubyObject version(ThreadContext context) { return stringOrNil(context.runtime, getDocument().getXmlVersion()); } @JRubyMethod(meta = true) public static IRubyObject substitute_entities_set(ThreadContext context, IRubyObject cls, IRubyObject value) { XmlDocument.substituteEntities = value.isTrue(); return context.nil; } public IRubyObject getInternalSubset(ThreadContext context) { IRubyObject dtd = (IRubyObject) node.getUserData(DTD_INTERNAL_SUBSET); if (dtd == null) { Document document = getDocument(); if (document.getUserData(XmlDocument.DTD_RAW_DOCUMENT) != null) { dtd = XmlDtd.newFromInternalSubset(context.runtime, document); } else if (document.getDoctype() != null) { DocumentType docType = document.getDoctype(); IRubyObject name, publicId, systemId; name = publicId = systemId = context.nil; if (docType.getName() != null) { name = context.runtime.newString(docType.getName()); } if (docType.getPublicId() != null) { publicId = context.runtime.newString(docType.getPublicId()); } if (docType.getSystemId() != null) { systemId = context.runtime.newString(docType.getSystemId()); } dtd = XmlDtd.newEmpty(context.runtime, document, name, publicId, systemId); } else { dtd = context.nil; } setInternalSubset(dtd); } return dtd; } /** * Assumes XmlNode#internal_subset() has returned nil. (i.e. there * is not already an internal subset). */ public IRubyObject createInternalSubset(ThreadContext context, IRubyObject name, IRubyObject external_id, IRubyObject system_id) { XmlDtd dtd = XmlDtd.newEmpty(context.runtime, getDocument(), name, external_id, system_id); setInternalSubset(dtd); return dtd; } protected void setInternalSubset(IRubyObject data) { node.setUserData(DTD_INTERNAL_SUBSET, data, null); } public IRubyObject getExternalSubset(ThreadContext context) { IRubyObject dtd = (IRubyObject) node.getUserData(DTD_EXTERNAL_SUBSET); if (dtd == null) { return context.nil; } return dtd; } /** * Assumes XmlNode#external_subset() has returned nil. (i.e. there * is not already an external subset). */ public IRubyObject createExternalSubset(ThreadContext context, IRubyObject name, IRubyObject external_id, IRubyObject system_id) { XmlDtd dtd = XmlDtd.newEmpty(context.runtime, getDocument(), name, external_id, system_id); setExternalSubset(dtd); return dtd; } protected void setExternalSubset(IRubyObject data) { node.setUserData(DTD_EXTERNAL_SUBSET, data, null); } @Override public void accept(ThreadContext context, SaveContextVisitor visitor) { Document document = getDocument(); visitor.enter(document); NodeList children = document.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { Node child = children.item(i); short type = child.getNodeType(); if (type == Node.COMMENT_NODE) { XmlComment xmlComment = (XmlComment) getCachedNodeOrCreate(context.runtime, child); xmlComment.accept(context, visitor); } else if (type == Node.DOCUMENT_TYPE_NODE) { XmlDtd xmlDtd = (XmlDtd) getCachedNodeOrCreate(context.runtime, child); xmlDtd.accept(context, visitor); } else if (type == Node.PROCESSING_INSTRUCTION_NODE) { XmlProcessingInstruction xmlProcessingInstruction = (XmlProcessingInstruction) getCachedNodeOrCreate(context.runtime, child); xmlProcessingInstruction.accept(context, visitor); } else if (type == Node.TEXT_NODE) { XmlText xmlText = (XmlText) getCachedNodeOrCreate(context.runtime, child); xmlText.accept(context, visitor); } else if (type == Node.ELEMENT_NODE) { XmlElement xmlElement = (XmlElement) getCachedNodeOrCreate(context.runtime, child); xmlElement.accept(context, visitor); } } visitor.leave(document); } @JRubyMethod(meta = true) public static IRubyObject wrap(ThreadContext context, IRubyObject klass, IRubyObject arg) { XmlDocument xmlDocument = new XmlDocument(context.runtime, (RubyClass) klass, arg.toJava(Document.class)); Helpers.invoke(context, xmlDocument, "initialize"); return xmlDocument; } @Deprecated @JRubyMethod(meta = true, visibility = Visibility.PRIVATE) public static IRubyObject wrapJavaDocument(ThreadContext context, IRubyObject klass, IRubyObject arg) { return wrap(context, klass, arg); } @Deprecated // default to_java works (due inherited from XmlNode#toJava) @JRubyMethod(visibility = Visibility.PRIVATE) public IRubyObject toJavaDocument(ThreadContext context) { return JavaUtil.convertJavaToUsableRubyObject(context.getRuntime(), node); } /* call-seq: * doc.canonicalize(mode=XML_C14N_1_0,inclusive_namespaces=nil,with_comments=false) * doc.canonicalize { |obj, parent| ... } * * Canonicalize a document and return the results. Takes an optional block * that takes two parameters: the +obj+ and that node's +parent+. * The +obj+ will be either a Nokogiri::XML::Node, or a Nokogiri::XML::Namespace * The block must return a non-nil, non-false value if the +obj+ passed in * should be included in the canonicalized document. */ @JRubyMethod(optional = 3) public IRubyObject canonicalize(ThreadContext context, IRubyObject[] args, Block block) { int mode = 0; String inclusive_namespace = null; Boolean with_comments = false; if (args.length > 0 && !(args[0].isNil())) { mode = RubyFixnum.fix2int(args[0]); } if (args.length > 1) { if (!args[1].isNil() && !(args[1] instanceof List)) { throw context.runtime.newTypeError("Expected array"); } if (!args[1].isNil()) { inclusive_namespace = ((RubyArray)args[1]) .join(context, context.runtime.newString(" ")) .asString() .asJavaString(); // OMG I wish I knew JRuby better, this is ugly } } if (args.length > 2) { with_comments = args[2].isTrue(); } String algorithmURI = null; switch (mode) { case 0: // XML_C14N_1_0 if (with_comments) { algorithmURI = Canonicalizer.ALGO_ID_C14N_WITH_COMMENTS; } else { algorithmURI = Canonicalizer.ALGO_ID_C14N_OMIT_COMMENTS; } break; case 1: // XML_C14N_EXCLUSIVE_1_0 if (with_comments) { algorithmURI = Canonicalizer.ALGO_ID_C14N_EXCL_WITH_COMMENTS; } else { algorithmURI = Canonicalizer.ALGO_ID_C14N_EXCL_OMIT_COMMENTS; } break; case 2: // XML_C14N_1_1 = 2 if (with_comments) { algorithmURI = Canonicalizer.ALGO_ID_C14N11_WITH_COMMENTS; } else { algorithmURI = Canonicalizer.ALGO_ID_C14N11_OMIT_COMMENTS; } } try { Canonicalizer canonicalizer = Canonicalizer.getInstance(algorithmURI); XmlNode startingNode = getStartingNode(block); byte[] result; CanonicalFilter filter = new CanonicalFilter(context, block); if (inclusive_namespace == null) { result = canonicalizer.canonicalizeSubtree(startingNode.getNode(), filter); } else { result = canonicalizer.canonicalizeSubtree(startingNode.getNode(), inclusive_namespace, filter); } return RubyString.newString(context.runtime, new ByteList(result, UTF8Encoding.INSTANCE)); } catch (Exception e) { throw context.getRuntime().newRuntimeError(e.getMessage()); } } private XmlNode getStartingNode(Block block) { if (block.isGiven()) { IRubyObject boundSelf = block.getBinding().getSelf(); if (boundSelf instanceof XmlNode) { return (XmlNode) boundSelf; } } return this; } public void resetNamespaceCache(ThreadContext context) { nsCache = new NokogiriNamespaceCache(); createAndCacheNamespaces(context.runtime, node); } }