package nokogiri;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.xerces.parsers.AbstractSAXParser;
import net.sourceforge.htmlunit.cyberneko.parsers.SAXParser;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyFixnum;
import org.jruby.RubyString;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.xml.sax.SAXException;

import nokogiri.internals.NokogiriHandler;

import static nokogiri.internals.NokogiriHelpers.rubyStringToString;

/**
 * Class for Nokogiri::HTML4::SAX::ParserContext.
 *
 * @author serabe
 * @author Patrick Mahoney <pat@polycrystal.org>
 * @author Yoko Harada <yokolet@gmail.com>
 */
@JRubyClass(name = "Nokogiri::HTML4::SAX::ParserContext", parent = "Nokogiri::XML::SAX::ParserContext")
public class Html4SaxParserContext extends XmlSaxParserContext
{
  private static final long serialVersionUID = 1L;

  /**
   * Creates a context of the given Ruby class and runs its
   * <code>initialize(runtime)</code> step before handing it back.
   *
   * @param runtime the JRuby runtime the context belongs to
   * @param klazz the Ruby class to instantiate
   * @return the initialized context
   */
  static Html4SaxParserContext
  newInstance(final Ruby runtime, final RubyClass klazz)
  {
    Html4SaxParserContext instance = new Html4SaxParserContext(runtime, klazz);
    instance.initialize(runtime);
    return instance;
  }

  /**
   * Constructs an uninitialized context; {@link #newInstance} is the factory
   * used throughout this class.
   *
   * @param ruby the JRuby runtime
   * @param rubyClass the Ruby class of the new object
   */
  public
  Html4SaxParserContext(Ruby ruby, RubyClass rubyClass)
  {
    super(ruby, rubyClass);
  }

  /**
   * Builds the NekoHTML SAX parser used by this context, with the
   * <code>names/elems</code> and <code>names/attrs</code> properties set to
   * <code>"lower"</code> and the <code>ignore-specified-charset</code> scanner
   * feature switched on.
   *
   * @return the configured parser
   * @throws SAXException if a property or feature cannot be set; the original
   *         exception is attached as the cause
   */
  @Override
  protected AbstractSAXParser
  createParser() throws SAXException
  {
    SAXParser parser = new SAXParser();

    try {
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
      parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");

      // NekoHTML should not try to guess the encoding based on the meta
      // tags or other information in the document.  This is already
      // handled by the EncodingReader.
      parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
      return parser;
    } catch (SAXException ex) {
      // Keep the historical message text but preserve the cause so the
      // original stack trace is not lost.
      throw new SAXException("Problem while creating HTML4 SAX Parser: " + ex.toString(), ex);
    }
  }

  /**
   * Ruby <code>ParserContext.memory(data, encoding)</code>: creates a context
   * that reads the document from an in-memory string.
   *
   * @param context the current thread context
   * @param klazz the Ruby class receiving the call
   * @param data the document; converted with <code>convertToString()</code>
   * @param encoding a Ruby String naming a charset, or an Integer looked up
   *        in {@link EncodingType}
   * @return the new parser context
   */
  @JRubyMethod(name = "memory", meta = true)
  public static IRubyObject
  parse_memory(ThreadContext context,
               IRubyObject klazz,
               IRubyObject data,
               IRubyObject encoding)
  {
    Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz);
    String javaEncoding = findEncodingName(context, encoding);
    // NOTE(review): when the encoding cannot be resolved (null) no input
    // source is set on the context at all -- confirm callers always pass one.
    if (javaEncoding != null) {
      CharSequence input = applyEncoding(rubyStringToString(data.convertToString()), javaEncoding);
      // NOTE(review): getBytes() uses the platform default charset while the
      // input source is declared as javaEncoding below -- confirm this is
      // intended before changing it.
      ByteArrayInputStream istream = new ByteArrayInputStream(input.toString().getBytes());
      ctx.setInputSource(istream);
      ctx.getInputSource().setEncoding(javaEncoding);
    }
    return ctx;
  }

  /**
   * Integer encoding identifiers accepted in place of an encoding name, each
   * paired with the name that is handed to {@link Charset#forName(String)}.
   */
  public enum EncodingType {
    NONE(0, "NONE"),
    UTF_8(1, "UTF-8"),
    UTF16LE(2, "UTF16LE"),
    UTF16BE(3, "UTF16BE"),
    UCS4LE(4, "UCS4LE"),
    UCS4BE(5, "UCS4BE"),
    EBCDIC(6, "EBCDIC"),
    UCS4_2143(7, "UCS4-2143"),
    UCS4_3412(8, "UCS4-3412"),
    UCS2(9, "UCS2"),
    ISO_8859_1(10, "ISO-8859-1"),
    ISO_8859_2(11, "ISO-8859-2"),
    ISO_8859_3(12, "ISO-8859-3"),
    ISO_8859_4(13, "ISO-8859-4"),
    ISO_8859_5(14, "ISO-8859-5"),
    ISO_8859_6(15, "ISO-8859-6"),
    ISO_8859_7(16, "ISO-8859-7"),
    ISO_8859_8(17, "ISO-8859-8"),
    ISO_8859_9(18, "ISO-8859-9"),
    ISO_2022_JP(19, "ISO-2022-JP"),
    SHIFT_JIS(20, "SHIFT-JIS"),
    EUC_JP(21, "EUC-JP"),
    ASCII(22, "ASCII");

    /** Cached copy of {@link #values()} so lookups do not clone the array. */
    private static final EncodingType[] VALUES = values();

    private final int value;
    private final String name;

    EncodingType(int value, String name)
    {
      this.value = value;
      this.name = name;
    }

    /**
     * @return the integer identifier of this encoding
     */
    public int
    getValue()
    {
      return value;
    }

    /**
     * @return the encoding name associated with this constant
     */
    @Override
    public String
    toString()
    {
      return name;
    }

    /**
     * Looks a constant up by position.
     *
     * @param ordinal index into the declaration order
     * @return the constant at that index, or <code>null</code> when out of range
     */
    // NOTE: assuming ordinal == value
    static EncodingType
    get(final int ordinal)
    {
      if (ordinal >= 0 && ordinal < VALUES.length) {
        return VALUES[ordinal];
      }
      return null;
    }
  }

  /**
   * Maps an integer encoding identifier to its encoding name.
   *
   * @param value the identifier
   * @return the name, or <code>null</code> when the identifier is unknown
   */
  private static String
  findEncodingName(final int value)
  {
    EncodingType type = EncodingType.get(value);
    if (type == null) { return null; }
    assert type.value == value;
    return type.name;
  }

  /**
   * Resolves the Ruby-side encoding argument to the name of a charset known
   * to this JVM.
   *
   * @param context the current thread context, used to raise Ruby errors
   * @param encoding a Ruby String (charset name) or Integer (see
   *        {@link EncodingType}); any other object yields <code>null</code>
   * @return the resolved charset name, or <code>null</code> when no name
   *         could be derived from <code>encoding</code>
   */
  private static String
  findEncodingName(ThreadContext context, IRubyObject encoding)
  {
    String rubyEncoding = null;
    if (encoding instanceof RubyString) {
      rubyEncoding = rubyStringToString((RubyString) encoding);
    } else if (encoding instanceof RubyFixnum) {
      rubyEncoding = findEncodingName(RubyFixnum.fix2int((RubyFixnum) encoding));
    }
    if (rubyEncoding == null) { return null; }

    try {
      return Charset.forName(rubyEncoding).displayName();
    } catch (UnsupportedCharsetException e) {
      throw context.runtime.newEncodingCompatibilityError(rubyEncoding + " is not supported");
    } catch (IllegalCharsetNameException e) {
      throw context.runtime.newEncodingError(e.getMessage());
    }
  }

  /** Matches a <code>charset=NAME</code> declaration, case-insensitively. */
  private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+",
      Pattern.CASE_INSENSITIVE);

  /**
   * Rewrites the last <code>charset=...</code> declaration found in the
   * document text so that it names <code>enc</code>.
   *
   * @param input the document text
   * @param enc the encoding name to write into the declaration
   * @return the rewritten text, or <code>input</code> itself when no
   *         declaration matches
   */
  private static CharSequence
  applyEncoding(final String input, final String enc)
  {
    int start_pos = 0;
    int end_pos = 0;
    // Cheap substring probe first so the regex only runs when it can match.
    if (containsIgnoreCase(input, "charset")) {
      Matcher m = CHARSET_PATTERN.matcher(input);
      // Keep iterating: the positions of the LAST match win.
      while (m.find()) {
        start_pos = m.start();
        end_pos = m.end();
      }
    }
    if (start_pos != end_pos) {
      return new StringBuilder(input).replace(start_pos, end_pos, "charset=" + enc);
    }
    return input;
  }

  /**
   * Case-insensitive substring test.
   *
   * @param str the text to search
   * @param sub the text to look for
   * @return <code>true</code> when <code>sub</code> is empty or occurs in
   *         <code>str</code> ignoring case
   */
  private static boolean
  containsIgnoreCase(final String str, final String sub)
  {
    final int len = sub.length();
    final int max = str.length() - len;

    if (len == 0) { return true; }

    final char c0Lower = Character.toLowerCase(sub.charAt(0));
    final char c0Upper = Character.toUpperCase(sub.charAt(0));

    for (int i = 0; i <= max; i++) {
      final char ch = str.charAt(i);
      if (ch != c0Lower && Character.toLowerCase(ch) != c0Lower && Character.toUpperCase(ch) != c0Upper) {
        continue; // first char doesn't match
      }

      if (str.regionMatches(true, i + 1, sub, 0 + 1, len - 1)) {
        return true;
      }
    }

    return false;
  }

  /**
   * Ruby <code>ParserContext.file(data, encoding)</code>: creates a context
   * that reads the document from a file.
   *
   * @param context the current thread context
   * @param klass the Ruby class receiving the call
   * @param data a Ruby String passed on to <code>setInputSourceFile</code>
   * @param encoding a Ruby String naming the charset
   * @return the new parser context
   */
  @JRubyMethod(name = "file", meta = true)
  public static IRubyObject
  parse_file(ThreadContext context,
             IRubyObject klass,
             IRubyObject data,
             IRubyObject encoding)
  {
    if (!(data instanceof RubyString)) {
      throw context.runtime.newTypeError("data must be kind_of String");
    }
    if (!(encoding instanceof RubyString)) {
      throw context.runtime.newTypeError("encoding must be kind_of String");
    }

    Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass);
    ctx.setInputSourceFile(context, data);
    String javaEncoding = findEncodingName(context, encoding);
    if (javaEncoding != null) {
      ctx.getInputSource().setEncoding(javaEncoding);
    }
    return ctx;
  }

  /**
   * Ruby <code>ParserContext.io(data, encoding)</code>: creates a context
   * that reads the document from an IO-like object.
   *
   * @param context the current thread context
   * @param klass the Ruby class receiving the call
   * @param data the object passed on to <code>setIOInputSource</code>
   * @param encoding an Integer encoding identifier (see {@link EncodingType})
   * @return the new parser context
   */
  @JRubyMethod(name = "io", meta = true)
  public static IRubyObject
  parse_io(ThreadContext context,
           IRubyObject klass,
           IRubyObject data,
           IRubyObject encoding)
  {
    if (!(encoding instanceof RubyFixnum)) {
      throw context.runtime.newTypeError("encoding must be kind_of Integer");
    }

    Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass);
    ctx.setIOInputSource(context, data, context.nil);
    String javaEncoding = findEncodingName(context, encoding);
    if (javaEncoding != null) {
      ctx.getInputSource().setEncoding(javaEncoding);
    }
    return ctx;
  }

  /**
   * Create a new parser context that will read from a raw input stream.
   * Meant to be run in a separate thread by Html4SaxPushParser.
   */
  static Html4SaxParserContext
  parse_stream(final Ruby runtime, RubyClass klass, InputStream stream)
  {
    Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(runtime, klass);
    ctx.setInputSource(stream);
    return ctx;
  }

  @Override
  protected void
  preParse(final Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler)
  {
    // this function is meant to be empty.  It overrides the one in XmlSaxParserContext
  }
}