util.rb in sup-0.10

- old
+ new

@@ -175,11 +175,11 @@
 
 class String
   ## nasty multibyte hack for ruby 1.8. if it's utf-8, split into chars using
   ## the utf8 regex and count those. otherwise, use the byte length.
   def display_length
-    if $encoding == "UTF-8" || $encoding == "utf8"
+    if RUBY_VERSION < '1.9.1' && ($encoding == "UTF-8" || $encoding == "utf8") # new: regex char-count path only when the interpreter version string sorts below '1.9.1' (plain string comparison)
       scan(/./u).size
     else
       size # all other cases, including a UTF-8 $encoding on 1.9.1 and later, use String#size
     end
   end
@@ -288,16 +288,49 @@
     def ord
       self[0] # integer-index lookup of the first element; NOTE(review): the guard this sits inside opens outside this hunk — presumably a Ruby 1.8 fallback where String#[] yields the byte value, confirm
     end
   end
 
+  unless method_defined? :each # define #each only when this class does not already have one
+    def each &b # captures the caller's block and hands it straight to each_line
+      each_line &b
+    end
+  end
+
   ## takes a list of words, and returns a Set of symbols (each word is
   ## stripped, then interned).  typically used in Sup for translating Ferret's
   ## representation of a list of labels (a string) to a Set of label symbols.
   ##
   ## split_on will be passed to String#split, so you can leave this nil for space.
   def to_set_of_symbols split_on=nil; Set.new split(split_on).map { |x| x.strip.intern } end
+
+  class CheckError < ArgumentError; end # raised by #check; subclass of ArgumentError
+  def check # raises CheckError when the string fails either encoding test below; evaluates to nil otherwise
+    begin
+      fail "unexpected encoding #{encoding}" if respond_to?(:encoding) && !(encoding == Encoding::UTF_8 || encoding == Encoding::ASCII)
+      fail "invalid encoding" if respond_to?(:valid_encoding?) && !valid_encoding? # the respond_to? guards skip each test when the method is unavailable
+    rescue # bare rescue: any StandardError raised above is re-raised as CheckError with the same message
+      raise CheckError.new($!.message)
+    end
+  end
+
+  def ascii # builds and returns a new string; bytes with the high bit set become a literal backslash-x plus their hex value
+    out = ""
+    each_byte do |b|
+      if (b & 128) != 0 # high bit set, i.e. byte value 128 or above
+        out << "\\x#{b.to_s 16}"
+      else
+        out << b.chr # 7-bit byte: appended as the corresponding single character
+      end
+    end
+    out.force_encoding Encoding::UTF_8 if out.respond_to? :force_encoding # tag the result as UTF-8 when the method is available
+    out
+  end
+
+  def transcode src_encoding=$encoding # src_encoding defaults to the global $encoding; result is whatever Iconv.easy_decode returns
+    Iconv.easy_decode $encoding, src_encoding, self # $encoding is passed as the target, src_encoding as the source charset
+  end
 end
 
 class Numeric
   def clamp min, max
     if self < min
@@ -484,13 +517,13 @@
       strftime "%b %Y"
     elsif month != from.month
       strftime "%b %e"
     else
       if is_the_same_day? from
-        strftime("%l:%M%P")
+        strftime("%l:%M%p").downcase # emulate %P (missing on ruby 1.8 darwin)
       elsif is_the_day_before? from
-        "Yest."  + nearest_hour.strftime("%l%P")
+        "Yest."  + nearest_hour.strftime("%l%p").downcase # emulate %P
       else
         strftime "%b %e"
       end
     end
   end
@@ -639,23 +672,28 @@
     @m.synchronize { !@over && @over = true }
   end
 end
 
 class Iconv
-  def self.easy_decode target, charset, text
-    return text if charset =~ /^(x-unknown|unknown[-_ ]?8bit|ascii[-_ ]?7[-_ ]?bit)$/i
-    charset = case charset
+  def self.easy_decode target, orig_charset, text
+    if text.respond_to? :force_encoding
+      text = text.dup
+      text.force_encoding Encoding::BINARY
+    end
+    charset = case orig_charset
       when /UTF[-_ ]?8/i then "utf-8"
       when /(iso[-_ ])?latin[-_ ]?1$/i then "ISO-8859-1"
       when /iso[-_ ]?8859[-_ ]?15/i then 'ISO-8859-15'
       when /unicode[-_ ]1[-_ ]1[-_ ]utf[-_]7/i then "utf-7"
-      else charset
+      when /^euc$/i then 'EUC-JP' # XXX try them all?
+      when /^(x-unknown|unknown[-_ ]?8bit|ascii[-_ ]?7[-_ ]?bit)$/i then 'ASCII'
+      else orig_charset
     end
 
     begin
-      Iconv.iconv(target + "//IGNORE", charset, text + " ").join[0 .. -2]
-    rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::InvalidCharacter, Iconv::IllegalSequence => e
-      warn "couldn't transcode text from #{charset} to #{target} (\"#{text[0 ... 20]}\"...) (got #{e.message}); using original as is"
-      text
+      returning(Iconv.iconv(target, charset, text + " ").join[0 .. -2]) { |str| str.check }
+    rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::InvalidCharacter, Iconv::IllegalSequence, String::CheckError
+      debug "couldn't transcode text from #{orig_charset} (#{charset}) to #{target}) (#{text[0 ... 20].inspect}...) (got #{$!.message} (#{$!.class}))"
+      text.ascii
     end
   end
 end