#
# xmlscan/xmlchar.rb
#
#   Copyright (C) Ueno Katsuhiro 2002
#
# $Id: xmlchar.rb,v 1.7 2003/04/30 03:03:35 katsu Exp $
#

require 'xmlscan/scanner'


module XMLScan

  # Character-class tables and validity predicates for XML text.
  #
  # The tables below are arrays of inclusive code point ranges.  From them
  # the module builds regular expressions (CharPattern, NamePattern, ...)
  # and exposes predicates such as valid_name? and valid_chardata?, all of
  # which are available both as module functions and as (private) mixin
  # methods.
  #
  # NOTE(review): the class names (Char, BaseChar, Ideographic, ...) look
  # like the character classes of the XML 1.0 specification; the ranges are
  # reproduced here verbatim from the generated table and have not been
  # re-verified against the specification.
  #
  # NOTE(review): this code relies on `Regexp.new(src, nil, 'U')` and on
  # `?-` / `?|` being usable with pack('C'), i.e. it appears to target
  # Ruby 1.8 semantics -- confirm before running on a newer interpreter.
  module XMLChar

    # generated by samples/getxmlchar.rb
    char = [
      0x0009..0x0009, 0x000A..0x000A, 0x000D..0x000D, 0x0020..0xD7FF,
      0xE000..0xFFFD, 0x10000..0x10FFFF,
    ]

    base_char = [  # for Letter
      0x0041..0x005A, 0x0061..0x007A, 0x00C0..0x00D6, 0x00D8..0x00F6,
      0x00F8..0x00FF, 0x0100..0x0131, 0x0134..0x013E, 0x0141..0x0148,
      0x014A..0x017E, 0x0180..0x01C3, 0x01CD..0x01F0, 0x01F4..0x01F5,
      0x01FA..0x0217, 0x0250..0x02A8, 0x02BB..0x02C1, 0x0386..0x0386,
      0x0388..0x038A, 0x038C..0x038C, 0x038E..0x03A1, 0x03A3..0x03CE,
      0x03D0..0x03D6, 0x03DA..0x03DA, 0x03DC..0x03DC, 0x03DE..0x03DE,
      0x03E0..0x03E0, 0x03E2..0x03F3, 0x0401..0x040C, 0x040E..0x044F,
      0x0451..0x045C, 0x045E..0x0481, 0x0490..0x04C4, 0x04C7..0x04C8,
      0x04CB..0x04CC, 0x04D0..0x04EB, 0x04EE..0x04F5, 0x04F8..0x04F9,
      0x0531..0x0556, 0x0559..0x0559, 0x0561..0x0586, 0x05D0..0x05EA,
      0x05F0..0x05F2, 0x0621..0x063A, 0x0641..0x064A, 0x0671..0x06B7,
      0x06BA..0x06BE, 0x06C0..0x06CE, 0x06D0..0x06D3, 0x06D5..0x06D5,
      0x06E5..0x06E6, 0x0905..0x0939, 0x093D..0x093D, 0x0958..0x0961,
      0x0985..0x098C, 0x098F..0x0990, 0x0993..0x09A8, 0x09AA..0x09B0,
      0x09B2..0x09B2, 0x09B6..0x09B9, 0x09DC..0x09DD, 0x09DF..0x09E1,
      0x09F0..0x09F1, 0x0A05..0x0A0A, 0x0A0F..0x0A10, 0x0A13..0x0A28,
      0x0A2A..0x0A30, 0x0A32..0x0A33, 0x0A35..0x0A36, 0x0A38..0x0A39,
      0x0A59..0x0A5C, 0x0A5E..0x0A5E, 0x0A72..0x0A74, 0x0A85..0x0A8B,
      0x0A8D..0x0A8D, 0x0A8F..0x0A91, 0x0A93..0x0AA8, 0x0AAA..0x0AB0,
      0x0AB2..0x0AB3, 0x0AB5..0x0AB9, 0x0ABD..0x0ABD, 0x0AE0..0x0AE0,
      0x0B05..0x0B0C, 0x0B0F..0x0B10, 0x0B13..0x0B28, 0x0B2A..0x0B30,
      0x0B32..0x0B33, 0x0B36..0x0B39, 0x0B3D..0x0B3D, 0x0B5C..0x0B5D,
      0x0B5F..0x0B61, 0x0B85..0x0B8A, 0x0B8E..0x0B90, 0x0B92..0x0B95,
      0x0B99..0x0B9A, 0x0B9C..0x0B9C, 0x0B9E..0x0B9F, 0x0BA3..0x0BA4,
      0x0BA8..0x0BAA, 0x0BAE..0x0BB5, 0x0BB7..0x0BB9, 0x0C05..0x0C0C,
      0x0C0E..0x0C10, 0x0C12..0x0C28, 0x0C2A..0x0C33, 0x0C35..0x0C39,
      0x0C60..0x0C61, 0x0C85..0x0C8C, 0x0C8E..0x0C90, 0x0C92..0x0CA8,
      0x0CAA..0x0CB3, 0x0CB5..0x0CB9, 0x0CDE..0x0CDE, 0x0CE0..0x0CE1,
      0x0D05..0x0D0C, 0x0D0E..0x0D10, 0x0D12..0x0D28, 0x0D2A..0x0D39,
      0x0D60..0x0D61, 0x0E01..0x0E2E, 0x0E30..0x0E30, 0x0E32..0x0E33,
      0x0E40..0x0E45, 0x0E81..0x0E82, 0x0E84..0x0E84, 0x0E87..0x0E88,
      0x0E8A..0x0E8A, 0x0E8D..0x0E8D, 0x0E94..0x0E97, 0x0E99..0x0E9F,
      0x0EA1..0x0EA3, 0x0EA5..0x0EA5, 0x0EA7..0x0EA7, 0x0EAA..0x0EAB,
      0x0EAD..0x0EAE, 0x0EB0..0x0EB0, 0x0EB2..0x0EB3, 0x0EBD..0x0EBD,
      0x0EC0..0x0EC4, 0x0F40..0x0F47, 0x0F49..0x0F69, 0x10A0..0x10C5,
      0x10D0..0x10F6, 0x1100..0x1100, 0x1102..0x1103, 0x1105..0x1107,
      0x1109..0x1109, 0x110B..0x110C, 0x110E..0x1112, 0x113C..0x113C,
      0x113E..0x113E, 0x1140..0x1140, 0x114C..0x114C, 0x114E..0x114E,
      0x1150..0x1150, 0x1154..0x1155, 0x1159..0x1159, 0x115F..0x1161,
      0x1163..0x1163, 0x1165..0x1165, 0x1167..0x1167, 0x1169..0x1169,
      0x116D..0x116E, 0x1172..0x1173, 0x1175..0x1175, 0x119E..0x119E,
      0x11A8..0x11A8, 0x11AB..0x11AB, 0x11AE..0x11AF, 0x11B7..0x11B8,
      0x11BA..0x11BA, 0x11BC..0x11C2, 0x11EB..0x11EB, 0x11F0..0x11F0,
      0x11F9..0x11F9, 0x1E00..0x1E9B, 0x1EA0..0x1EF9, 0x1F00..0x1F15,
      0x1F18..0x1F1D, 0x1F20..0x1F45, 0x1F48..0x1F4D, 0x1F50..0x1F57,
      0x1F59..0x1F59, 0x1F5B..0x1F5B, 0x1F5D..0x1F5D, 0x1F5F..0x1F7D,
      0x1F80..0x1FB4, 0x1FB6..0x1FBC, 0x1FBE..0x1FBE, 0x1FC2..0x1FC4,
      0x1FC6..0x1FCC, 0x1FD0..0x1FD3, 0x1FD6..0x1FDB, 0x1FE0..0x1FEC,
      0x1FF2..0x1FF4, 0x1FF6..0x1FFC, 0x2126..0x2126, 0x212A..0x212B,
      0x212E..0x212E, 0x2180..0x2182, 0x3041..0x3094, 0x30A1..0x30FA,
      0x3105..0x312C, 0xAC00..0xD7A3,
    ]

    ideographic = [  # for Letter
      0x3007..0x3007, 0x3021..0x3029, 0x4E00..0x9FA5,
    ]

    combining_char = [  # for NameChar
      0x0300..0x0345, 0x0360..0x0361, 0x0483..0x0486, 0x0591..0x05A1,
      0x05A3..0x05B9, 0x05BB..0x05BD, 0x05BF..0x05BF, 0x05C1..0x05C2,
      0x05C4..0x05C4, 0x064B..0x0652, 0x0670..0x0670, 0x06D6..0x06DC,
      0x06DD..0x06DF, 0x06E0..0x06E4, 0x06E7..0x06E8, 0x06EA..0x06ED,
      0x0901..0x0903, 0x093C..0x093C, 0x093E..0x094C, 0x094D..0x094D,
      0x0951..0x0954, 0x0962..0x0963, 0x0981..0x0983, 0x09BC..0x09BC,
      0x09BE..0x09BE, 0x09BF..0x09BF, 0x09C0..0x09C4, 0x09C7..0x09C8,
      0x09CB..0x09CD, 0x09D7..0x09D7, 0x09E2..0x09E3, 0x0A02..0x0A02,
      0x0A3C..0x0A3C, 0x0A3E..0x0A3E, 0x0A3F..0x0A3F, 0x0A40..0x0A42,
      0x0A47..0x0A48, 0x0A4B..0x0A4D, 0x0A70..0x0A71, 0x0A81..0x0A83,
      0x0ABC..0x0ABC, 0x0ABE..0x0AC5, 0x0AC7..0x0AC9, 0x0ACB..0x0ACD,
      0x0B01..0x0B03, 0x0B3C..0x0B3C, 0x0B3E..0x0B43, 0x0B47..0x0B48,
      0x0B4B..0x0B4D, 0x0B56..0x0B57, 0x0B82..0x0B83, 0x0BBE..0x0BC2,
      0x0BC6..0x0BC8, 0x0BCA..0x0BCD, 0x0BD7..0x0BD7, 0x0C01..0x0C03,
      0x0C3E..0x0C44, 0x0C46..0x0C48, 0x0C4A..0x0C4D, 0x0C55..0x0C56,
      0x0C82..0x0C83, 0x0CBE..0x0CC4, 0x0CC6..0x0CC8, 0x0CCA..0x0CCD,
      0x0CD5..0x0CD6, 0x0D02..0x0D03, 0x0D3E..0x0D43, 0x0D46..0x0D48,
      0x0D4A..0x0D4D, 0x0D57..0x0D57, 0x0E31..0x0E31, 0x0E34..0x0E3A,
      0x0E47..0x0E4E, 0x0EB1..0x0EB1, 0x0EB4..0x0EB9, 0x0EBB..0x0EBC,
      0x0EC8..0x0ECD, 0x0F18..0x0F19, 0x0F35..0x0F35, 0x0F37..0x0F37,
      0x0F39..0x0F39, 0x0F3E..0x0F3E, 0x0F3F..0x0F3F, 0x0F71..0x0F84,
      0x0F86..0x0F8B, 0x0F90..0x0F95, 0x0F97..0x0F97, 0x0F99..0x0FAD,
      0x0FB1..0x0FB7, 0x0FB9..0x0FB9, 0x20D0..0x20DC, 0x20E1..0x20E1,
      0x302A..0x302F, 0x3099..0x3099, 0x309A..0x309A,
    ]

    digit = [  # for NameChar
      0x0030..0x0039, 0x0660..0x0669, 0x06F0..0x06F9, 0x0966..0x096F,
      0x09E6..0x09EF, 0x0A66..0x0A6F, 0x0AE6..0x0AEF, 0x0B66..0x0B6F,
      0x0BE7..0x0BEF, 0x0C66..0x0C6F, 0x0CE6..0x0CEF, 0x0D66..0x0D6F,
      0x0E50..0x0E59, 0x0ED0..0x0ED9, 0x0F20..0x0F29,
    ]

    extender = [  # for NameChar
      0x00B7..0x00B7, 0x02D0..0x02D0, 0x02D1..0x02D1, 0x0387..0x0387,
      0x0640..0x0640, 0x0E46..0x0E46, 0x0EC6..0x0EC6, 0x3005..0x3005,
      0x3031..0x3035, 0x309D..0x309E, 0x30FC..0x30FE,
    ]

    letter = base_char + ideographic


    # Public range tables.  NameChar additionally allows '-', '.', ':' and
    # '_'; NameFirstChar allows only ':' and '_' besides letters.
    Char = char
    NameChar = [ 0x2d..0x2e, 0x3a..0x3a, 0x5f..0x5f ] +
      letter + combining_char + digit + extender
    NameFirstChar = [ 0x3a..0x3a, 0x5f..0x5f ] + letter

    # Keep every table ordered by the first code point of each range.
    [ Char, NameChar, NameFirstChar ].each { |i|
      i.sort! { |a,b| a.begin <=> b.begin }
    }


    # Probe the regexp engine: the class U+0080-U+0100 must not match
    # U+0101.  If it does, multibyte ranges starting in U+0080..U+00FF are
    # mishandled and the workaround below is installed.
    if Regexp.new("[\xc2\x80-\xc4\x80]", nil, 'U') =~ "\xc4\x81" then
      # regexp engine is buggy ;p
      buggy_regexp = true
    else
      buggy_regexp = false
    end


    # Throw-away helper object that turns a range table into regexp source.
    o = Object.new
    class << o

      # Builds the inside of a character class (without the brackets) from
      # an array of code point ranges.  A range that straddles the
      # ASCII / non-ASCII border is emitted as two ranges, and a leading
      # '-' is escaped so it is not taken as a range operator.
      def charclass(a)
        a.collect { |i|
          b, e = i.begin, i.end
          if b == e then
            [b].pack('U')
          elsif b + 1 == e then
            [b,e].pack('UU')
          elsif b < 0x80 and e >= 0x80 then
            "#{b.chr}-\x7f" + [0x80,?-,e].pack('UCU')
          else
            [b,?-,e].pack('UCU')
          end
        }.join.sub(/\A-/, '\\\\-')
      end

      # Regexp source matching one character contained in the table.
      def make_regexp(a)
        "[#{charclass(a)}]"
      end

      # Regexp source matching one character NOT contained in the table.
      def make_neg_regexp(a)
        "[^#{charclass(a)}]"
      end

    end

    if buggy_regexp then

      class << o
        remove_method :make_regexp

        # Workaround variant: ranges are split at U+0080 and U+0100, and
        # every code point of a range starting in U+0080..U+00FF is listed
        # as an explicit alternative instead of being put in the class.
        def make_regexp(a)
          b = []
          a.each { |r|
            if r.begin < 0x80 and r.end >= 0x80 then
              b.push r.begin..0x7f
              r = 0x80..r.end
            end
            if r.begin < 0x100 and r.end >= 0x100 then
              b.push r.begin..0xff
              r = 0x100..r.end
            end
            b.push r
          }
          mbc8 = b.select { |r| r.begin >= 0x80 and r.begin <= 0xff }
          a = b.reject { |r| r.begin >= 0x80 and r.begin <= 0xff }
          if mbc8.empty? then
            "[#{charclass(a)}]"
          else
            dst = "(?:[#{charclass(a)}]"
            mbc8.each { |r| r.each { |i| dst << [?|, i].pack('CU') } }
            dst << ")"
          end
        end
      end

    end


    # Whole-string patterns (anchored) and "first offending character"
    # patterns (negated classes) derived from the tables above.
    CharPattern = Regexp.new("\\A#{o.make_regexp(Char)}*\\z", nil, 'U')
    NotCharPattern = Regexp.new(o.make_neg_regexp(Char), nil, 'U')

    NmtokenPattern = Regexp.new("\\A#{o.make_regexp(NameChar)}+\\z", nil, 'U')
    NotNameCharPattern = Regexp.new(o.make_neg_regexp(NameChar), nil, 'U')

    NamePattern = Regexp.new('\A' + o.make_regexp(NameFirstChar) +
                             o.make_regexp(NameChar) + '*\z', nil, 'U')


    # True if the integer code point +code+ is listed in Char.
    def valid_char?(code)
      NotCharPattern !~ [code].pack('U')
    end

    # True if +str+ contains no character outside Char.
    def valid_chardata?(str)
      NotCharPattern !~ str
    end

    # True if +str+ contains no character outside NameChar.
    def valid_nmtoken?(str)
      NotNameCharPattern !~ str
    end

    # True if +str+ is one NameFirstChar followed by zero or more NameChar.
    def valid_name?(str)
      not NamePattern !~ str
    end

    if buggy_regexp then
      # Only make_regexp has a workaround, so the negated-class patterns
      # cannot be trusted; use the anchored positive patterns instead.
      remove_method :valid_char?
      remove_method :valid_chardata?
      remove_method :valid_nmtoken?

      def valid_char?(code)
        not CharPattern !~ [code].pack('U')
      end

      def valid_chardata?(str)
        not CharPattern !~ str
      end

      def valid_nmtoken?(str)
        not NmtokenPattern !~ str
      end
    end

    module_function :valid_char?, :valid_chardata?
    module_function :valid_nmtoken?, :valid_name?


    # True if +str+ consists only of the characters permitted in a public
    # identifier by the pattern below.
    def valid_pubid?(str)
      /[^\- \r\na-zA-Z0-9'()+,.\/:=?;!*#\@$_%]/u !~ str
    end
    # Exposed as a module function like every other valid_*? predicate.
    module_function :valid_pubid?

    # True if +str+ consists only of ASCII letters, digits, '-', '_', '.'
    # and ':'.
    def valid_version?(str)
      /[^\-a-zA-Z0-9_.:]/u !~ str
    end
    module_function :valid_version?

    # True if +str+ is an ASCII letter followed by any number of ASCII
    # letters, digits, '-', '.' or '_'.  Always returns true or false.
    def valid_encoding?(str)
      if /\A[A-Za-z]([\-A-Za-z0-9._])*\z/u =~ str then
        true
      else
        false
      end
    end
    module_function :valid_encoding?

  end



  class XMLScanner

    # Scanner extension that validates names and character data in every
    # on_* event before passing the event on with +super+.  Violations are
    # reported through parse_error / wellformed_error, which are expected
    # to be provided by the object this module is mixed into.
    module StrictChar

      include XMLChar

      private

      def check_valid_name(name)
        unless valid_name? name then
          parse_error "`#{name}' is not valid for XML name"
        end
      end

      def check_valid_chardata(str)
        unless valid_chardata? str then
          parse_error "invalid XML character is found"
        end
      end

      def check_valid_char(code)
        unless valid_char? code then
          wellformed_error "#{code} is not a valid XML character"
        end
      end

      def check_valid_version(str)
        unless valid_version? str then
          parse_error "#{str} is not a valid XML version"
        end
      end

      def check_valid_encoding(str)
        unless valid_encoding? str then
          parse_error "#{str} is not a valid XML encoding name"
        end
      end

      def check_valid_pubid(str)
        unless valid_pubid? str then
          parse_error "#{str} is not a valid public ID"
        end
      end


      def on_xmldecl_version(str)
        check_valid_version str
        super
      end

      def on_xmldecl_encoding(str)
        check_valid_encoding str
        super
      end

      def on_xmldecl_standalone(str)
        check_valid_chardata str
        super
      end

      # pubid and sysid are only checked when present (non-nil/non-false).
      def on_doctype(root, pubid, sysid)
        check_valid_name root
        check_valid_pubid pubid if pubid
        check_valid_chardata sysid if sysid
        super
      end

      def on_comment(str)
        check_valid_chardata str
        super
      end

      def on_pi(target, pi)
        check_valid_name target
        check_valid_chardata pi
        super
      end

      def on_chardata(str)
        check_valid_chardata str
        super
      end

      def on_cdata(str)
        check_valid_chardata str
        super
      end

      def on_etag(name)
        check_valid_name name
        super
      end

      def on_entityref(ref)
        check_valid_name ref
        super
      end

      def on_charref(code)
        check_valid_char code
        super
      end

      def on_charref_hex(code)
        check_valid_char code
        super
      end

      def on_stag(name)
        check_valid_name name
        super
      end

      def on_attribute(name)
        check_valid_name name
        super
      end

      def on_attr_value(str)
        check_valid_chardata str
        super
      end

      def on_attr_entityref(ref)
        check_valid_name ref
        super
      end

      def on_attr_charref(code)
        check_valid_char code
        super
      end

      def on_attr_charref_hex(code)
        check_valid_char code
        super
      end

    end


    private

    # Mixes StrictChar into this scanner instance.
    # NOTE(review): presumably invoked by the option handling in
    # xmlscan/scanner for the :strict_char option -- confirm there.
    def apply_option_strict_char
      extend StrictChar
    end

  end

end





# Stand-alone driver: scans the files given on the command line (ARGF)
# with :strict_char enabled, prints diagnostics when $VERBOSE is set, and
# reports the user CPU time spent parsing.
if $0 == __FILE__ then
  class TestVisitor
    include XMLScan::Visitor
    def parse_error(msg)
      STDERR.printf("%s:%d: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
    def wellformed_error(msg)
      STDERR.printf("%s:%d: WFC: %s\n", $s.path, $s.lineno, msg) if $VERBOSE
    end
  end

  $s = scan = XMLScan::XMLScanner.new(TestVisitor.new, :strict_char)
  src = ARGF
  def src.path; filename; end
  # Process.times replaces the obsolete Time.times.
  t1 = Process.times.utime
  scan.parse src
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end