/** * (The MIT License) * * Copyright (c) 2008 - 2012: * * * {Aaron Patterson}[http://tenderlovemaking.com] * * {Mike Dalessio}[http://mike.daless.io] * * {Charles Nutter}[http://blog.headius.com] * * {Sergio Arbeo}[http://www.serabe.com] * * {Patrick Mahoney}[http://polycrystal.org] * * {Yoko Harada}[http://yokolet.blogspot.com] * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * 'Software'), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/
package nokogiri.internals;

import java.io.File;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import nokogiri.HtmlDocument;
import nokogiri.NokogiriService;
import nokogiri.XmlAttr;
import nokogiri.XmlCdata;
import nokogiri.XmlComment;
import nokogiri.XmlDocument;
import nokogiri.XmlDtd;
import nokogiri.XmlElement;
import nokogiri.XmlEntityReference;
import nokogiri.XmlNamespace;
import nokogiri.XmlNode;
import nokogiri.XmlProcessingInstruction;
import nokogiri.XmlText;

import org.jcodings.specific.UTF8Encoding;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyClass;
import org.jruby.RubyEncoding;
import org.jruby.RubyString;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.util.ByteList;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.DOMException;

/**
 * A class for various utility methods.
 *
 * @author serabe
 * @author Patrick Mahoney
 * @author Yoko Harada
 */
public class NokogiriHelpers {
    /** User-data key under which a DOM node stores its Ruby-side wrapper. */
    public static final String CACHED_NODE = "NOKOGIRI_CACHED_NODE";
    /** User-data key; the spelling of the value is kept as-is because it is a runtime key. */
    public static final String VALID_ROOT_NODE = "NOKOGIRI_VALIDE_ROOT_NODE";
    /** User-data key holding a Boolean flag that {@link #shouldEncode(Node)} reads. */
    public static final String ENCODED_STRING = "NOKOGIRI_ENCODED_STRING";

    private static final Charset utf8 = Charset.forName("UTF-8");
    private static final Charset shift_jis = Charset.forName("Shift_JIS");
    private static final Charset jis = Charset.forName("ISO-2022-JP");
    private static final Charset euc_jp = Charset.forName("EUC-JP");

    /**
     * Returns the wrapper stored on {@code node} under {@link #CACHED_NODE},
     * or {@code null} when nothing has been stored.
     */
    public static XmlNode getCachedNode(Node node) {
        return (XmlNode) node.getUserData(CACHED_NODE);
    }

    /**
     * Get the XmlNode associated with the underlying node. Creates a new
     * XmlNode (or appropriate subclass) or XmlNamespace wrapping node if
     * there is no cached value.
     *
     * @param ruby the runtime used to build new wrappers
     * @param node the DOM node; may be {@code null}
     * @return Ruby nil for a {@code null} node, otherwise the wrapper
     */
    public static IRubyObject getCachedNodeOrCreate(Ruby ruby, Node node) {
        if (node == null) return ruby.getNil();

        // Namespace declarations of non-HTML documents are served from the
        // owning document's namespace cache rather than wrapped as attributes.
        if (node.getNodeType() == Node.ATTRIBUTE_NODE && isNamespace(node.getNodeName())) {
            XmlDocument xmlDocument = (XmlDocument) node.getOwnerDocument().getUserData(CACHED_NODE);
            if (!(xmlDocument instanceof HtmlDocument)) {
                Attr attr = (Attr) node;
                String prefix = getLocalNameForNamespace(attr.getName());
                if (prefix == null) prefix = "";
                String href = attr.getValue();
                XmlNamespace xmlNamespace = xmlDocument.getNamespaceCache().get(prefix, href);
                if (xmlNamespace != null) return xmlNamespace;
                return XmlNamespace.createFromAttr(ruby, attr);
            }
        }

        XmlNode xmlNode = getCachedNode(node);
        if (xmlNode == null) {
            xmlNode = (XmlNode) constructNode(ruby, node);
            node.setUserData(CACHED_NODE, xmlNode, null);
        }
        return xmlNode;
    }

    /**
     * Construct a new XmlNode wrapping node. The proper subclass of XmlNode
     * is chosen based on the type of node.
     *
     * @return Ruby nil for a {@code null} node, otherwise the new wrapper
     */
    public static IRubyObject constructNode(Ruby runtime, Node node) {
        if (node == null) return runtime.getNil();

        switch (node.getNodeType()) {
        case Node.ELEMENT_NODE:
            XmlElement xmlElement = (XmlElement) NokogiriService.XML_ELEMENT_ALLOCATOR.allocate(
                    runtime, getNokogiriClass(runtime, "Nokogiri::XML::Element"));
            xmlElement.setNode(runtime.getCurrentContext(), node);
            return xmlElement;
        case Node.ATTRIBUTE_NODE:
            XmlAttr xmlAttr = (XmlAttr) NokogiriService.XML_ATTR_ALLOCATOR.allocate(
                    runtime, getNokogiriClass(runtime, "Nokogiri::XML::Attr"));
            xmlAttr.setNode(runtime.getCurrentContext(), node);
            return xmlAttr;
        case Node.TEXT_NODE:
            XmlText xmlText = (XmlText) NokogiriService.XML_TEXT_ALLOCATOR.allocate(
                    runtime, getNokogiriClass(runtime, "Nokogiri::XML::Text"));
            xmlText.setNode(runtime.getCurrentContext(), node);
            return xmlText;
        case Node.COMMENT_NODE:
            XmlComment xmlComment = (XmlComment) NokogiriService.XML_COMMENT_ALLOCATOR.allocate(
                    runtime, getNokogiriClass(runtime, "Nokogiri::XML::Comment"));
            xmlComment.setNode(runtime.getCurrentContext(), node);
            return xmlComment;
        case Node.ENTITY_NODE:
            return new XmlNode(runtime, getNokogiriClass(runtime, "Nokogiri::XML::EntityDecl"), node);
        case Node.ENTITY_REFERENCE_NODE:
            XmlEntityReference xmlEntityRef =
                    (XmlEntityReference) NokogiriService.XML_ENTITY_REFERENCE_ALLOCATOR.allocate(
                            runtime, getNokogiriClass(runtime, "Nokogiri::XML::EntityReference"));
            xmlEntityRef.setNode(runtime.getCurrentContext(), node);
            return xmlEntityRef;
        case Node.PROCESSING_INSTRUCTION_NODE:
            XmlProcessingInstruction xmlProcessingInstruction =
                    (XmlProcessingInstruction) NokogiriService.XML_PROCESSING_INSTRUCTION_ALLOCATOR.allocate(
                            runtime, getNokogiriClass(runtime, "Nokogiri::XML::ProcessingInstruction"));
            xmlProcessingInstruction.setNode(runtime.getCurrentContext(), node);
            return xmlProcessingInstruction;
        case Node.CDATA_SECTION_NODE:
            XmlCdata xmlCdata = (XmlCdata) NokogiriService.XML_CDATA_ALLOCATOR.allocate(
                    runtime, getNokogiriClass(runtime, "Nokogiri::XML::CDATA"));
            xmlCdata.setNode(runtime.getCurrentContext(), node);
            return xmlCdata;
        case Node.DOCUMENT_NODE:
            XmlDocument xmlDocument = (XmlDocument) NokogiriService.XML_DOCUMENT_ALLOCATOR.allocate(
                    runtime, getNokogiriClass(runtime, "Nokogiri::XML::Document"));
            xmlDocument.setDocumentNode(runtime.getCurrentContext(), node);
            return xmlDocument;
        case Node.DOCUMENT_TYPE_NODE:
            XmlDtd xmlDtd = (XmlDtd) NokogiriService.XML_DTD_ALLOCATOR.allocate(
                    runtime, getNokogiriClass(runtime, "Nokogiri::XML::DTD"));
            xmlDtd.setNode(runtime, node);
            return xmlDtd;
        default:
            XmlNode xmlNode = (XmlNode) NokogiriService.XML_NODE_ALLOCATOR.allocate(
                    runtime, getNokogiriClass(runtime, "Nokogiri::XML::Node"));
            xmlNode.setNode(runtime.getCurrentContext(), node);
            return xmlNode;
        }
    }

    /**
     * Looks up a Ruby class by its fully qualified Ruby name in
     * {@code NokogiriService.nokogiriClassCache}. The {@code ruby} argument
     * is not used by the lookup.
     */
    public static RubyClass getNokogiriClass(Ruby ruby, String name) {
        return NokogiriService.nokogiriClassCache.get(name);
    }

    /** Converts {@code s} to a Ruby string, or returns Ruby nil when {@code s} is {@code null}. */
    public static IRubyObject stringOrNil(Ruby runtime, String s) {
        if (s == null) return runtime.getNil();
        return convertJavaStringToRuby(runtime, s);
    }

    /** Wraps {@code bytes} in a Ruby string, or returns Ruby nil when {@code bytes} is {@code null}. */
    public static IRubyObject stringOrNil(Ruby runtime, byte[] bytes) {
        if (bytes == null) return runtime.getNil();
        return RubyString.newString(runtime, bytes);
    }

    /** Converts {@code s} to a Ruby string, or returns a new empty Ruby string when {@code s} is {@code null}. */
    public static IRubyObject stringOrBlank(Ruby runtime, String s) {
        if (s == null) return runtime.newString();
        return convertJavaStringToRuby(runtime, s);
    }

    /**
     * Builds a Ruby string from a Java string. When the runtime reports 1.9
     * mode the bytes are produced as UTF-8 and tagged with the UTF-8 encoding.
     */
    private static IRubyObject convertJavaStringToRuby(Ruby runtime, String str) {
        if (runtime.is1_9()) {
            ByteList bytes = new ByteList(str.getBytes(RubyEncoding.UTF8), UTF8Encoding.INSTANCE);
            return RubyString.newString(runtime, bytes);
        }
        return RubyString.newString(runtime, str);
    }

    /**
     * Convert s to a RubyString, or if s is null or empty return RubyNil.
     */
    public static IRubyObject nonEmptyStringOrNil(Ruby runtime, String s) {
        if (s == null || s.length() == 0) return runtime.getNil();
        return RubyString.newString(runtime, s);
    }

    /**
     * Return the prefix of a qualified name like "prefix:local".
     * Returns null if there is no prefix (or if {@code qName} is null).
     */
    public static String getPrefix(String qName) {
        if (qName == null) return null;
        int pos = qName.indexOf(':');
        return (pos > 0) ? qName.substring(0, pos) : null;
    }

    /**
     * Return the local part of a qualified name like "prefix:local".
     * Returns qName if there is no prefix.
     */
    public static String getLocalPart(String qName) {
        if (qName == null) return null;
        int pos = qName.indexOf(':');
        return (pos > 0) ? qName.substring(pos + 1) : qName;
    }

    /**
     * Returns the local part of a namespace-declaration attribute name, or
     * {@code null} when that local part is exactly {@code "xmlns"} (the
     * default-namespace declaration).
     */
    public static String getLocalNameForNamespace(String name) {
        String localName = getLocalPart(name);
        return ("xmlns".equals(localName)) ? null : localName;
    }

    /** Returns the shared UTF-8 charset instance. */
    private static Charset getCharsetUTF8() {
        return utf8;
    }

    /**
     * Converts a Ruby object to a Java String, returning {@code null} for
     * Ruby nil. The object is first converted with {@code convertToString()}.
     *
     * In 1.9 mode the bytes are decoded using the string's own encoding name;
     * otherwise they are decoded as UTF-8. The original authors noted that
     * RubyString#toString() decodes as ISO-8859-1, which is why it is only
     * used as the fallback when the encoding name is not supported by the JVM.
     */
    public static String rubyStringToString(IRubyObject str) {
        if (str.isNil()) return null;
        return toJavaString(str.convertToString());
    }

    private static String toJavaString(RubyString str) {
        ByteList value = str.getByteList();
        try {
            if (str.getRuntime().is1_9()) {
                return new String(value.getUnsafeBytes(), value.begin(), value.length(),
                                  str.getEncoding().toString());
            }
            return RubyEncoding.decodeUTF8(value.getUnsafeBytes(), value.begin(), value.length());
        } catch (UnsupportedEncodingException uee) {
            // The JVM does not know the Ruby encoding name; fall back to toString().
            return str.toString();
        }
    }

    /**
     * Decodes the bytes of {@code str} as UTF-8, regardless of the encoding
     * the Ruby string is tagged with.
     */
    public static String rubyStringToString(RubyString str) {
        ByteList byteList = str.getByteList();
        ByteBuffer buf = ByteBuffer.wrap(byteList.unsafeBytes(), byteList.begin(), byteList.length());
        return getCharsetUTF8().decode(buf).toString();
    }

    /**
     * Copies {@code ary} into a Java list, calling {@code toString()} on each
     * element. {@code null} elements are skipped.
     */
    public static List<String> rubyStringArrayToJavaList(RubyArray ary) {
        List<String> list = new ArrayList<String>();
        for (int i = 0; i < ary.getLength(); i++) {
            Object obj = ary.get(i);
            if (obj != null) list.add(obj.toString());
        }
        return list;
    }

    /**
     * Builds an XPath-like location path for {@code node} by walking from the
     * node up to its document and prepending one step per ancestor.
     *
     * Elements, comments, text/CDATA and processing instructions get a
     * positional predicate ({@code [n]}) when a sibling would match the same
     * step. Namespaced elements without a prefix are written as {@code *}.
     * Attributes are written as {@code /@name}. Node types not handled
     * explicitly contribute a bare {@code ?} step.
     *
     * @param node the node to describe
     * @return the path; the empty string when {@code node} is {@code null}
     */
    public static String getNodeCompletePath(Node node) {
        String path = "";
        Node cur = node;
        while (cur != null) {
            String sep = "?";
            String name = "";
            int occur = 0;
            Node next;

            switch (cur.getNodeType()) {
            case Node.DOCUMENT_NODE:
                // The root step is already present once anything has been prepended.
                if (path.startsWith("/")) return path;
                sep = "/";
                next = null;
                break;
            case Node.ELEMENT_NODE: {
                boolean generic = false;
                sep = "/";
                name = cur.getLocalName();
                if (name == null) name = cur.getNodeName();
                if (cur.getNamespaceURI() != null) {
                    if (cur.getPrefix() != null) {
                        name = cur.getPrefix() + ":" + name;
                    } else {
                        // Default-namespaced element: no prefix to write, so match any name.
                        generic = true;
                        name = "*";
                    }
                }
                next = cur.getParentNode();
                occur = positionAmongSiblings(cur, generic);
                break;
            }
            case Node.COMMENT_NODE:
                sep = "/";
                name = "comment()";
                next = cur.getParentNode();
                occur = positionAmongSiblings(cur, false);
                break;
            case Node.TEXT_NODE:
            case Node.CDATA_SECTION_NODE:
                sep = "/";
                name = "text()";
                next = cur.getParentNode();
                occur = positionAmongSiblings(cur, false);
                break;
            case Node.PROCESSING_INSTRUCTION_NODE:
                sep = "/";
                // getLocalName() is null for processing instructions; the target is the node name.
                name = "processing-instruction('" + cur.getNodeName() + "')";
                next = cur.getParentNode();
                occur = positionAmongSiblings(cur, false);
                break;
            case Node.ATTRIBUTE_NODE:
                sep = "/@";
                name = cur.getLocalName();
                if (name == null) name = cur.getNodeName();
                if (cur.getNamespaceURI() != null && cur.getPrefix() != null) {
                    name = cur.getPrefix() + ":" + name;
                }
                next = ((Attr) cur).getOwnerElement();
                break;
            default:
                next = cur.getParentNode();
                break;
            }

            if (occur == 0) {
                path = sep + name + path;
            } else {
                path = sep + name + "[" + occur + "]" + path;
            }
            cur = next;
        }
        return path;
    }

    /**
     * Returns the 1-based position of {@code cur} among the siblings that
     * would match the same path step, or 0 when no other sibling matches (so
     * no positional predicate is needed).
     */
    private static int positionAmongSiblings(Node cur, boolean generic) {
        int preceding = 0;
        for (Node s = cur.getPreviousSibling(); s != null; s = s.getPreviousSibling()) {
            if (matchesSameStep(cur, s, generic)) preceding++;
        }
        if (preceding > 0) return preceding + 1;

        // First of its kind: a predicate is needed only if a later sibling also matches.
        for (Node s = cur.getNextSibling(); s != null; s = s.getNextSibling()) {
            if (matchesSameStep(cur, s, generic)) return 1;
        }
        return 0;
    }

    /** Tells whether {@code other} would be selected by the same path step as {@code cur}. */
    private static boolean matchesSameStep(Node cur, Node other, boolean generic) {
        short otherType = other.getNodeType();
        switch (cur.getNodeType()) {
        case Node.ELEMENT_NODE:
            return otherType == Node.ELEMENT_NODE && (generic || fullNamesMatch(other, cur));
        case Node.COMMENT_NODE:
            return otherType == Node.COMMENT_NODE;
        case Node.TEXT_NODE:
        case Node.CDATA_SECTION_NODE:
            return otherType == Node.TEXT_NODE || otherType == Node.CDATA_SECTION_NODE;
        case Node.PROCESSING_INSTRUCTION_NODE:
            return otherType == Node.PROCESSING_INSTRUCTION_NODE
                    && other.getNodeName().equals(cur.getNodeName());
        default:
            return false;
        }
    }

    /** Compares two nodes by local name and prefix; two {@code null} values count as equal. */
    protected static boolean compareTwoNodes(Node m, Node n) {
        return nodesAreEqual(m.getLocalName(), n.getLocalName())
                && nodesAreEqual(m.getPrefix(), n.getPrefix());
    }

    /** Compares two nodes by their {@code getNodeName()} values. */
    protected static boolean fullNamesMatch(Node a, Node b) {
        return a.getNodeName().equals(b.getNodeName());
    }

    /**
     * Returns {@code prefix:localName} (or just the local name when there is
     * no prefix); falls back to {@code getNodeName()} when the node has no
     * local name.
     */
    protected static String getFullName(Node n) {
        String lname = n.getLocalName();
        if (lname == null) return n.getNodeName();
        String prefix = n.getPrefix();
        return (prefix != null) ? prefix + ":" + lname : lname;
    }

    /** Null-safe equality: true when both are {@code null} or both are non-null and equal. */
    private static boolean nodesAreEqual(Object a, Object b) {
        if (a == null || b == null) return a == null && b == null;
        return b.equals(a);
    }

    // NOTE(review): the entity spellings below were restored on the assumption
    // that the reviewed copy had been HTML-decoded (its "encoded" table was
    // identical to the "decoded" one) -- confirm against version control.
    private static Pattern encoded_pattern = Pattern.compile("&amp;|&gt;|&lt;|&#13;");
    private static Pattern decoded_pattern = Pattern.compile("&|>|<|\r");
    private static String[] encoded = {"&amp;", "&gt;", "&lt;", "&#13;"};
    private static String[] decoded = {"&", ">", "<", "\r"};

    /**
     * Replaces every match of {@code ptn} in {@code input}: a match equal to
     * {@code oldChars[i]} is replaced by {@code newChars[i]}; a match found
     * in neither table is replaced by the empty string.
     */
    private static String convert(Pattern ptn, String input, String[] oldChars, String[] newChars) {
        Matcher matcher = ptn.matcher(input);
        boolean result = matcher.find();
        StringBuffer sb = new StringBuffer(); // Matcher.appendReplacement requires StringBuffer
        while (result) {
            String matched = matcher.group();
            String replacement = "";
            for (int i = 0; i < oldChars.length; i++) {
                if (oldChars[i].equals(matched)) {
                    replacement = newChars[i];
                    break;
                }
            }
            matcher.appendReplacement(sb, Matcher.quoteReplacement(replacement));
            result = matcher.find();
        }
        matcher.appendTail(sb);
        return sb.toString();
    }

    // NOTE(review): in the reviewed copy of this file the text between the
    // loop header of convert(...) and the charsetNames declaration was missing.
    // The tail of convert(...) above was completed from its visible signature
    // and locals. Any members that lived in that gap -- at least
    // isNamespace(String) and guessEncoding(), which this class still calls --
    // must be restored from version control.

    private static Set<String> charsetNames = Charset.availableCharsets().keySet();

    /**
     * Returns the given encoding name when it is one of the JVM's available
     * charset names, otherwise the result of {@code guessEncoding()}.
     */
    private static String ignoreInvalidEncoding(Ruby runtime, IRubyObject encoding) {
        String givenEncoding = rubyStringToString(encoding);
        if (charsetNames.contains(givenEncoding)) return givenEncoding;
        return guessEncoding();
    }

    /**
     * Resolves a relative system id against, in order, {@code baseURI},
     * {@code currentDir} and {@code scriptFileName}, returning the first
     * candidate that exists on disk.
     *
     * @return {@code systemId} unchanged when it is {@code null} or absolute;
     *         otherwise the resolved path, or {@code null} when nothing matched
     */
    public static String adjustSystemIdIfNecessary(String currentDir, String scriptFileName,
                                                   String baseURI, String systemId) {
        if (systemId == null) return systemId;
        if (new File(systemId).isAbsolute()) return systemId;

        String path = resolveSystemId(baseURI, systemId);
        if (path != null) return path;
        path = resolveSystemId(currentDir, systemId);
        if (path != null) return path;
        return resolveSystemId(scriptFileName, systemId);
    }

    /**
     * Looks for {@code systemId} next to {@code baseName} (or inside it, when
     * it is a directory) and returns the path if such a file exists.
     */
    private static String resolveSystemId(String baseName, String systemId) {
        if (baseName == null || baseName.length() < 1) return null;

        baseName = baseName.replace("%20", " ");
        File base = new File(baseName);
        String parentName = base.isDirectory() ? baseName : base.getParent();
        if (parentName == null) return null;

        // Case-insensitive, locale-independent check for a leading "file:" scheme.
        String scheme = "file:";
        if (parentName.regionMatches(true, 0, scheme, 0, scheme.length())) {
            parentName = parentName.substring(scheme.length());
        }

        File dtdFile = new File(parentName + "/" + systemId);
        return dtdFile.exists() ? dtdFile.getPath() : null;
    }

    /**
     * Tells whether {@code encoding} names the UTF-8 charset. A {@code null}
     * name is treated as UTF-8 (no conversion needed).
     */
    public static boolean isUTF8(String encoding) {
        if (encoding == null) return true; // no need to convert encoding
        return Charset.forName(encoding).compareTo(Charset.forName("UTF-8")) == 0;
    }

    /**
     * Encodes {@code input_string} with {@code output_charset}.
     *
     * @throws CharacterCodingException when the encoder rejects the input
     */
    public static byte[] convertEncoding(Charset output_charset, String input_string)
            throws CharacterCodingException {
        CharsetEncoder encoder = output_charset.newEncoder();
        ByteBuffer byteBuffer = encoder.encode(CharBuffer.wrap(input_string));
        byte[] buffer = new byte[byteBuffer.remaining()];
        byteBuffer.get(buffer);
        return buffer;
    }

    /**
     * For HTML documents whose parsed encoding differs from the document's
     * Ruby-side encoding, runs {@code thing} through {@link #nkf}; in every
     * other case returns {@code thing} unchanged.
     */
    public static String convertEncodingByNKFIfNecessary(Ruby runtime, XmlDocument doc, String thing) {
        if (!(doc instanceof HtmlDocument)) return thing;
        String parsed_encoding = ((HtmlDocument) doc).getPraedEncoding();
        if (parsed_encoding == null) return thing;
        String ruby_encoding = rubyStringToString(doc.getEncoding());
        if (ruby_encoding == null) return thing;
        if (Charset.forName(parsed_encoding).compareTo(Charset.forName(ruby_encoding)) == 0) {
            return thing;
        }
        return NokogiriHelpers.nkf(runtime, ruby_encoding, thing);
    }

    /**
     * Converts {@code thing} with JRuby's NKF extension, located reflectively.
     *
     * This method is used from HTML documents. HTML meta tag with encoding
     * specification might appear after non-ascii characters are used. For
     * example, a title tag before a meta tag. In such a case, Xerces encodes
     * characters in UTF-8 without seeing meta tag. Nokogiri uses NKF library
     * to convert characters correct encoding. This means the method works
     * only for JIS/Shift_JIS/EUC-JP.
     *
     * @return the converted string, or {@code thing} unchanged when the NKF
     *         class or method cannot be found or invoked
     */
    public static String nkf(Ruby runtime, String ruby_encoding, String thing) {
        StringBuilder options = new StringBuilder("-");
        Charset that = Charset.forName(ruby_encoding);
        if (NokogiriHelpers.shift_jis.compareTo(that) == 0) {
            options.append("S");
        } else if (NokogiriHelpers.jis.compareTo(that) == 0) {
            options.append("J");
        } else if (NokogiriHelpers.euc_jp.compareTo(that) == 0) {
            options.append("E");
        } else {
            // should not come here. should be treated before this method.
            options.append("W");
        }
        options.append("w");

        Class<?> nkfClass;
        try {
            // JRuby 1.7 and later
            nkfClass = runtime.getClassLoader().loadClass("org.jruby.ext.nkf.RubyNKF");
        } catch (ClassNotFoundException e1) {
            try {
                // Before JRuby 1.7
                nkfClass = runtime.getClassLoader().loadClass("org.jruby.RubyNKF");
            } catch (ClassNotFoundException e2) {
                return thing;
            }
        }

        // Any reflective failure is deliberately best-effort: hand back the input untouched.
        try {
            Method nkf_method = nkfClass.getMethod("nkf", ThreadContext.class, IRubyObject.class,
                                                   IRubyObject.class, IRubyObject.class);
            RubyString r_str = (RubyString) nkf_method.invoke(null, runtime.getCurrentContext(), null,
                    runtime.newString(options.toString()), runtime.newString(thing));
            return NokogiriHelpers.rubyStringToString(r_str);
        } catch (SecurityException e) {
            return thing;
        } catch (NoSuchMethodException e) {
            return thing;
        } catch (IllegalArgumentException e) {
            return thing;
        } catch (IllegalAccessException e) {
            return thing;
        } catch (InvocationTargetException e) {
            return thing;
        }
    }

    /**
     * Returns true unless the node carries a {@link #ENCODED_STRING}
     * user-data flag whose value is {@code true}.
     */
    public static boolean shouldEncode(Node text) {
        Object flag = text.getUserData(NokogiriHelpers.ENCODED_STRING);
        return flag == null || !((Boolean) flag).booleanValue();
    }

    /** The negation of {@link #shouldEncode(Node)}. */
    public static boolean shouldDecode(Node text) {
        return !shouldEncode(text);
    }

    /**
     * Renames {@code n} through its owner document and, when the DOM
     * implementation returns a different node object, tells the document's
     * namespace cache to replace the old node with the new one.
     *
     * @return the node returned by {@code Document.renameNode}
     * @throws DOMException propagated from {@code Document.renameNode}
     */
    public static Node renameNode(Node n, String namespaceURI, String qualifiedName) throws DOMException {
        Document doc = n.getOwnerDocument();
        XmlDocument xmlDoc = (XmlDocument) getCachedNode(doc);
        NokogiriNamespaceCache nsCache = xmlDoc.getNamespaceCache();
        Node result = doc.renameNode(n, namespaceURI, qualifiedName);
        if (result != n) {
            nsCache.replaceNode(n, result);
        }
        return result;
    }
}