mkunidata.rb in unicode-0.2.0

- old
+ new

@@ -1,15 +1,15 @@
 #! /usr/local/bin/ruby -KU
 
-if $KCODE != 'UTF8'
-  raise "$KCODE must be UTF8"
-end
+#if $KCODE != 'UTF8'
+#  raise "$KCODE must be UTF8"
+#end
 
 HEAD=<<EOS
 /*
  * UnicodeData
- * 1999 by yoshidam
+ * Copyright 1999, 2004 by yoshidam
  *
  */
 
 #ifndef _UNIDATA_MAP
 #define _UNIDATA_MAP
@@ -23,11 +23,11 @@
   const int uppercase;
   const int lowercase;
   const int titlecase;
 };
 
-const static struct unicode_data unidata[] = {
+static const struct unicode_data unidata[] = {
 EOS
 
 TAIL=<<EOS
 };
 
@@ -39,11 +39,11 @@
     return [nil, nil]
   end
   canon = ""
   compat = ""
   chars = hex.split(" ")
-  if chars[0] =~ /^[0-9A-F]{4}$/
+  if chars[0] =~ /^[0-9A-F]{4,6}$/
     chars.each do |c|
       canon << [c.hex].pack("U")
     end
     compat = canon
   elsif chars[0] =~ /^<.+>$/
@@ -57,11 +57,11 @@
   end
   [canon, compat]
 end
 
 def hex_or_nil(str)
-  return "-1" if str.nil?
+  return "-1" if str.nil? || str == ''
   return format("0x%04x", str.hex)
 end
 
 def printstr(str)
   return "NULL" if !str
@@ -79,23 +79,33 @@
 ## scan Composition Exclusions
 exclusion = {}
 open(ARGV[1]) do |f|
   while l = f.gets
     next if l =~ /^\#/ || l =~ /^$/
+    next if l !~ /Full_Composition_Exclusion/
     code, = l.split(/\s/)
-    code = code.hex
-    exclusion[code] = true
+    if code =~ /^[0-9A-F]+$/
+      code = code.hex
+      exclusion[code] = true
+    elsif code =~ /^([0-9A-F]+)\.\.([0-9A-F]+)$/
+#      p [$1, $2]
+      scode = $1.hex
+      ecode = $2.hex
+      for code in scode..ecode
+        exclusion[code] = true
+      end
+    end
   end
 end
 
 ## scan UnicodeData
 udata = {}
 open(ARGV[0]) do |f|
   while l = f.gets
     l.chomp!
     code, charname, gencat, ccclass, bidicat,decomp,
       dec, digit, num, mirror, uni1_0, comment, upcase,
-      lowcase, titlecase = l.split(";");
+      lowcase, titlecase = l.split(";", 15);
     code = code.hex
     ccclass = ccclass.to_i
     canon, compat = hex2str(decomp)
     upcase = hex_or_nil(upcase)
     lowcase = hex_or_nil(lowcase)