util.rb in sup-0.14.0

- old
+ new

@@ -1,13 +1,15 @@
+# encoding: utf-8
+
 require 'thread'
 require 'lockfile'
 require 'mime/types'
 require 'pathname'
 require 'set'
 require 'enumerator'
 require 'benchmark'
-require 'iconv'
+require 'unicode'
 
 ## time for some monkeypatching!
 class Symbol
   unless method_defined? :to_proc
     def to_proc
@@ -29,11 +31,11 @@
   end
 
   def dump_lock_id lock_id = @lock_id
       "host: %s\npid: %s\nppid: %s\ntime: %s\nuser: %s\npname: %s\n" %
         lock_id.values_at('host','pid','ppid','time','user', 'pname')
-    end
+  end
 
   def lockinfo_on_disk
     h = load_lock_id IO.read(path)
     h['mtime'] = File.mtime path
     h['path'] = path
@@ -112,10 +114,29 @@
       # end
     end
   end
 
   class Header
+
+    # Convert to ASCII before trying to match with regexp
+    class Field
+
+      EXTRACT_FIELD_NAME_RE = /\A([^\x00-\x1f\x7f-\xff :]+):\s*/no
+
+      class << self
+        def parse(field)
+          field = field.dup.to_s
+          field = field.fix_encoding.ascii
+          if field =~ EXTRACT_FIELD_NAME_RE
+            [ $1, $'.chomp ]
+          else
+            [ "", Field.value_strip(field) ]
+          end
+        end
+      end
+    end
+
     ## Be more cautious about invalid content-type headers
     ## the original RMail code calls
     ## value.strip.split(/\s*;\s*/)[0].downcase
     ## without checking if split returned an element
 
@@ -232,18 +253,18 @@
     ret
   end
 end
 
 class String
-  ## nasty multibyte hack for ruby 1.8. if it's utf-8, split into chars using
-  ## the utf8 regex and count those. otherwise, use the byte length.
   def display_length
-    if RUBY_VERSION < '1.9.1' && ($encoding == "UTF-8" || $encoding == "utf8")
-      # scan hack is somewhat slow, worth trying to cache
-      @display_length ||= scan(/./u).size
-    else
-      size
+    @display_length ||= Unicode.width(self, false)
+  end
+
+  def slice_by_display_length len
+    each_char.each_with_object "" do |c, buffer|
+      len -= c.display_length
+      buffer << c if len >= 0
     end
   end
 
   def camel_to_hyphy
     self.gsub(/([a-z])([A-Z0-9])/, '\1-\2').downcase
@@ -326,24 +347,73 @@
   end
 
   def wrap len
     ret = []
     s = self
-    while s.length > len
-      cut = s[0 ... len].rindex(/\s/)
+    while s.display_length > len
+      cut = s.slice_by_display_length(len).rindex(/\s/)
       if cut
         ret << s[0 ... cut]
         s = s[(cut + 1) .. -1]
       else
-        ret << s[0 ... len]
-        s = s[len .. -1]
+        ret << s.slice_by_display_length(len)
+        s = s[ret.last.length .. -1]
       end
     end
     ret << s
   end
 
+  # Fix the damn string! make sure it is valid utf-8, then convert to
+  # user encoding.
+  #
+  # Not Ruby 1.8 compatible
+  def fix_encoding
+    # first try to encode to utf-8 from whatever current encoding
+    encode!('UTF-8', :invalid => :replace, :undef => :replace)
+
+    # do this anyway in case string is set to be UTF-8, encoding to
+    # something else (UTF-16 which can fully represent UTF-8) and back
+    # ensures invalid chars are replaced.
+    encode!('UTF-16', 'UTF-8', :invalid => :replace, :undef => :replace)
+    encode!('UTF-8', 'UTF-16', :invalid => :replace, :undef => :replace)
+
+    fail "Could not create valid UTF-8 string out of: '#{self.to_s}'." unless valid_encoding?
+
+    # now convert to $encoding
+    encode!($encoding, :invalid => :replace, :undef => :replace)
+
+    fail "Could not create valid #{$encoding.inspect} string out of: '#{self.to_s}'." unless valid_encoding?
+
+    self
+  end
+
+  # transcode the string if original encoding is know
+  # fix if broken.
+  #
+  # Not Ruby 1.8 compatible
+  def transcode to_encoding, from_encoding
+    begin
+      encode!(to_encoding, from_encoding, :invalid => :replace, :undef => :replace)
+
+      unless valid_encoding?
+        # fix encoding (through UTF-8)
+        encode!('UTF-16', from_encoding, :invalid => :replace, :undef => :replace)
+        encode!(to_encoding, 'UTF-16', :invalid => :replace, :undef => :replace)
+      end
+
+    rescue Encoding::ConverterNotFoundError
+      debug "Encoding converter not found for #{from_encoding.inspect} or #{to_encoding.inspect}, fixing string: '#{self.to_s}', but expect weird characters."
+      fix_encoding
+    end
+
+    fail "Could not create valid #{to_encoding.inspect} string out of: '#{self.to_s}'." unless valid_encoding?
+
+    self
+  end
+
   def normalize_whitespace
+    fix_encoding
     gsub(/\t/, "    ").gsub(/\r/, "")
   end
 
   unless method_defined? :ord
     def ord
@@ -381,18 +451,14 @@
         out << "\\x#{b.to_s 16}"
       else
         out << b.chr
       end
     end
-    out.force_encoding Encoding::UTF_8 if out.respond_to? :force_encoding
-    out
+    out = out.fix_encoding # this should now be an utf-8 string of ascii
+                           # compat chars.
   end
 
-  def transcode src_encoding=$encoding
-    Iconv.easy_decode $encoding, src_encoding, self
-  end
-
   unless method_defined? :ascii_only?
     def ascii_only?
       size.times { |i| return false if self[i] & 128 != 0 }
       return true
     end
@@ -657,29 +723,5 @@
   def winner?
     @m.synchronize { !@over && @over = true }
   end
 end
 
-class Iconv
-  def self.easy_decode target, orig_charset, text
-    if text.respond_to? :force_encoding
-      text = text.dup
-      text.force_encoding Encoding::BINARY
-    end
-    charset = case orig_charset
-      when /UTF[-_ ]?8/i then "utf-8"
-      when /(iso[-_ ])?latin[-_ ]?1$/i then "ISO-8859-1"
-      when /iso[-_ ]?8859[-_ ]?15/i then 'ISO-8859-15'
-      when /unicode[-_ ]1[-_ ]1[-_ ]utf[-_]7/i then "utf-7"
-      when /^euc$/i then 'EUC-JP' # XXX try them all?
-      when /^(x-unknown|unknown[-_ ]?8bit|ascii[-_ ]?7[-_ ]?bit)$/i then 'ASCII'
-      else orig_charset
-    end
-
-    begin
-      returning(Iconv.iconv(target + "//IGNORE", charset, text + " ").join[0 .. -2]) { |str| str.check }
-    rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::InvalidCharacter, Iconv::IllegalSequence, String::CheckError
-      debug "couldn't transcode text from #{orig_charset} (#{charset}) to #{target} (#{text[0 ... 20].inspect}...): got #{$!.class} (#{$!.message})"
-      text.ascii
-    end
-  end
-end