lib/sup/util.rb in sup-0.13.2.1 vs lib/sup/util.rb in sup-0.14.0

- old
+ new

@@ -1,13 +1,15 @@ +# encoding: utf-8 + require 'thread' require 'lockfile' require 'mime/types' require 'pathname' require 'set' require 'enumerator' require 'benchmark' -require 'iconv' +require 'unicode' ## time for some monkeypatching! class Symbol unless method_defined? :to_proc def to_proc @@ -29,11 +31,11 @@ end def dump_lock_id lock_id = @lock_id "host: %s\npid: %s\nppid: %s\ntime: %s\nuser: %s\npname: %s\n" % lock_id.values_at('host','pid','ppid','time','user', 'pname') - end + end def lockinfo_on_disk h = load_lock_id IO.read(path) h['mtime'] = File.mtime path h['path'] = path @@ -112,10 +114,29 @@ # end end end class Header + + # Convert to ASCII before trying to match with regexp + class Field + + EXTRACT_FIELD_NAME_RE = /\A([^\x00-\x1f\x7f-\xff :]+):\s*/no + + class << self + def parse(field) + field = field.dup.to_s + field = field.fix_encoding.ascii + if field =~ EXTRACT_FIELD_NAME_RE + [ $1, $'.chomp ] + else + [ "", Field.value_strip(field) ] + end + end + end + end + ## Be more cautious about invalid content-type headers ## the original RMail code calls ## value.strip.split(/\s*;\s*/)[0].downcase ## without checking if split returned an element @@ -232,18 +253,18 @@ ret end end class String - ## nasty multibyte hack for ruby 1.8. if it's utf-8, split into chars using - ## the utf8 regex and count those. otherwise, use the byte length. def display_length - if RUBY_VERSION < '1.9.1' && ($encoding == "UTF-8" || $encoding == "utf8") - # scan hack is somewhat slow, worth trying to cache - @display_length ||= scan(/./u).size - else - size + @display_length ||= Unicode.width(self, false) + end + + def slice_by_display_length len + each_char.each_with_object "" do |c, buffer| + len -= c.display_length + buffer << c if len >= 0 end end def camel_to_hyphy self.gsub(/([a-z])([A-Z0-9])/, '\1-\2').downcase @@ -326,24 +347,73 @@ end def wrap len ret = [] s = self - while s.length > len - cut = s[0 ... 
len].rindex(/\s/) + while s.display_length > len + cut = s.slice_by_display_length(len).rindex(/\s/) if cut ret << s[0 ... cut] s = s[(cut + 1) .. -1] else - ret << s[0 ... len] - s = s[len .. -1] + ret << s.slice_by_display_length(len) + s = s[ret.last.length .. -1] end end ret << s end + # Fix the damn string! make sure it is valid utf-8, then convert to + # user encoding. + # + # Not Ruby 1.8 compatible + def fix_encoding + # first try to encode to utf-8 from whatever current encoding + encode!('UTF-8', :invalid => :replace, :undef => :replace) + + # do this anyway in case string is set to be UTF-8, encoding to + # something else (UTF-16 which can fully represent UTF-8) and back + # ensures invalid chars are replaced. + encode!('UTF-16', 'UTF-8', :invalid => :replace, :undef => :replace) + encode!('UTF-8', 'UTF-16', :invalid => :replace, :undef => :replace) + + fail "Could not create valid UTF-8 string out of: '#{self.to_s}'." unless valid_encoding? + + # now convert to $encoding + encode!($encoding, :invalid => :replace, :undef => :replace) + + fail "Could not create valid #{$encoding.inspect} string out of: '#{self.to_s}'." unless valid_encoding? + + self + end + + # transcode the string if original encoding is known + # fix if broken. + # + # Not Ruby 1.8 compatible + def transcode to_encoding, from_encoding + begin + encode!(to_encoding, from_encoding, :invalid => :replace, :undef => :replace) + + unless valid_encoding? + # fix encoding (through UTF-16) + encode!('UTF-16', from_encoding, :invalid => :replace, :undef => :replace) + encode!(to_encoding, 'UTF-16', :invalid => :replace, :undef => :replace) + end + + rescue Encoding::ConverterNotFoundError + debug "Encoding converter not found for #{from_encoding.inspect} or #{to_encoding.inspect}, fixing string: '#{self.to_s}', but expect weird characters." + fix_encoding + end + + fail "Could not create valid #{to_encoding.inspect} string out of: '#{self.to_s}'." unless valid_encoding? 
+ + self + end + def normalize_whitespace + fix_encoding gsub(/\t/, " ").gsub(/\r/, "") end unless method_defined? :ord def ord @@ -381,18 +451,14 @@ out << "\\x#{b.to_s 16}" else out << b.chr end end - out.force_encoding Encoding::UTF_8 if out.respond_to? :force_encoding - out + out = out.fix_encoding # this should now be a valid string in $encoding made up of ASCII + # compat chars. end - def transcode src_encoding=$encoding - Iconv.easy_decode $encoding, src_encoding, self - end - unless method_defined? :ascii_only? def ascii_only? size.times { |i| return false if self[i] & 128 != 0 } return true end @@ -657,29 +723,5 @@ def winner? @m.synchronize { !@over && @over = true } end end -class Iconv - def self.easy_decode target, orig_charset, text - if text.respond_to? :force_encoding - text = text.dup - text.force_encoding Encoding::BINARY - end - charset = case orig_charset - when /UTF[-_ ]?8/i then "utf-8" - when /(iso[-_ ])?latin[-_ ]?1$/i then "ISO-8859-1" - when /iso[-_ ]?8859[-_ ]?15/i then 'ISO-8859-15' - when /unicode[-_ ]1[-_ ]1[-_ ]utf[-_]7/i then "utf-7" - when /^euc$/i then 'EUC-JP' # XXX try them all? - when /^(x-unknown|unknown[-_ ]?8bit|ascii[-_ ]?7[-_ ]?bit)$/i then 'ASCII' - else orig_charset - end - - begin - returning(Iconv.iconv(target + "//IGNORE", charset, text + " ").join[0 .. -2]) { |str| str.check } - rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::InvalidCharacter, Iconv::IllegalSequence, String::CheckError - debug "couldn't transcode text from #{orig_charset} (#{charset}) to #{target} (#{text[0 ... 20].inspect}...): got #{$!.class} (#{$!.message})" - text.ascii - end - end -end