lib/sup/util.rb in sup-0.13.2.1 vs lib/sup/util.rb in sup-0.14.0
- old
+ new
@@ -1,13 +1,15 @@
+# encoding: utf-8
+
require 'thread'
require 'lockfile'
require 'mime/types'
require 'pathname'
require 'set'
require 'enumerator'
require 'benchmark'
-require 'iconv'
+require 'unicode'
## time for some monkeypatching!
class Symbol
unless method_defined? :to_proc
def to_proc
@@ -29,11 +31,11 @@
end
def dump_lock_id lock_id = @lock_id
"host: %s\npid: %s\nppid: %s\ntime: %s\nuser: %s\npname: %s\n" %
lock_id.values_at('host','pid','ppid','time','user', 'pname')
- end
+ end
def lockinfo_on_disk
h = load_lock_id IO.read(path)
h['mtime'] = File.mtime path
h['path'] = path
@@ -112,10 +114,29 @@
# end
end
end
class Header
+
+ # Convert to ASCII before trying to match with regexp
+ class Field
+
+ EXTRACT_FIELD_NAME_RE = /\A([^\x00-\x1f\x7f-\xff :]+):\s*/no
+
+ class << self
+ def parse(field)
+ field = field.dup.to_s
+ field = field.fix_encoding.ascii
+ if field =~ EXTRACT_FIELD_NAME_RE
+ [ $1, $'.chomp ]
+ else
+ [ "", Field.value_strip(field) ]
+ end
+ end
+ end
+ end
+
## Be more cautious about invalid content-type headers
## the original RMail code calls
## value.strip.split(/\s*;\s*/)[0].downcase
## without checking if split returned an element
@@ -232,18 +253,18 @@
ret
end
end
class String
- ## nasty multibyte hack for ruby 1.8. if it's utf-8, split into chars using
- ## the utf8 regex and count those. otherwise, use the byte length.
def display_length
- if RUBY_VERSION < '1.9.1' && ($encoding == "UTF-8" || $encoding == "utf8")
- # scan hack is somewhat slow, worth trying to cache
- @display_length ||= scan(/./u).size
- else
- size
+ @display_length ||= Unicode.width(self, false)
+ end
+
+ def slice_by_display_length len
+ each_char.each_with_object "" do |c, buffer|
+ len -= c.display_length
+ buffer << c if len >= 0
end
end
def camel_to_hyphy
self.gsub(/([a-z])([A-Z0-9])/, '\1-\2').downcase
@@ -326,24 +347,73 @@
end
def wrap len
ret = []
s = self
- while s.length > len
- cut = s[0 ... len].rindex(/\s/)
+ while s.display_length > len
+ cut = s.slice_by_display_length(len).rindex(/\s/)
if cut
ret << s[0 ... cut]
s = s[(cut + 1) .. -1]
else
- ret << s[0 ... len]
- s = s[len .. -1]
+ ret << s.slice_by_display_length(len)
+ s = s[ret.last.length .. -1]
end
end
ret << s
end
+ # Fix the damn string! Make sure it is valid UTF-8, then convert it to
+ # the user's encoding ($encoding). Modifies the receiver in place.
+ #
+ # Not Ruby 1.8 compatible
+ def fix_encoding
+ # first try to encode to utf-8 from whatever current encoding
+ encode!('UTF-8', :invalid => :replace, :undef => :replace)
+
+ # Do this anyway, in case the string is already marked as UTF-8:
+ # converting to another encoding (UTF-16, which can represent every
+ # UTF-8 character) and back ensures invalid characters are replaced.
+ encode!('UTF-16', 'UTF-8', :invalid => :replace, :undef => :replace)
+ encode!('UTF-8', 'UTF-16', :invalid => :replace, :undef => :replace)
+
+ fail "Could not create valid UTF-8 string out of: '#{self.to_s}'." unless valid_encoding?
+
+ # now convert to $encoding
+ encode!($encoding, :invalid => :replace, :undef => :replace)
+
+ fail "Could not create valid #{$encoding.inspect} string out of: '#{self.to_s}'." unless valid_encoding?
+
+ self
+ end
+
+ # Transcode the string when its original encoding is known;
+ # fix it if the result is broken.
+ #
+ # Not Ruby 1.8 compatible
+ def transcode to_encoding, from_encoding
+ begin
+ encode!(to_encoding, from_encoding, :invalid => :replace, :undef => :replace)
+
+ unless valid_encoding?
+ # fix encoding (through UTF-16)
+ encode!('UTF-16', from_encoding, :invalid => :replace, :undef => :replace)
+ encode!(to_encoding, 'UTF-16', :invalid => :replace, :undef => :replace)
+ end
+
+ rescue Encoding::ConverterNotFoundError
+ debug "Encoding converter not found for #{from_encoding.inspect} or #{to_encoding.inspect}, fixing string: '#{self.to_s}', but expect weird characters."
+ fix_encoding
+ end
+
+ fail "Could not create valid #{to_encoding.inspect} string out of: '#{self.to_s}'." unless valid_encoding?
+
+ self
+ end
+
def normalize_whitespace
+ fix_encoding
gsub(/\t/, " ").gsub(/\r/, "")
end
unless method_defined? :ord
def ord
@@ -381,18 +451,14 @@
out << "\\x#{b.to_s 16}"
else
out << b.chr
end
end
- out.force_encoding Encoding::UTF_8 if out.respond_to? :force_encoding
- out
+ out = out.fix_encoding # this should now be an utf-8 string of ascii
+ # compat chars.
end
- def transcode src_encoding=$encoding
- Iconv.easy_decode $encoding, src_encoding, self
- end
-
unless method_defined? :ascii_only?
def ascii_only?
size.times { |i| return false if self[i] & 128 != 0 }
return true
end
@@ -657,29 +723,5 @@
def winner?
@m.synchronize { !@over && @over = true }
end
end
-class Iconv
- def self.easy_decode target, orig_charset, text
- if text.respond_to? :force_encoding
- text = text.dup
- text.force_encoding Encoding::BINARY
- end
- charset = case orig_charset
- when /UTF[-_ ]?8/i then "utf-8"
- when /(iso[-_ ])?latin[-_ ]?1$/i then "ISO-8859-1"
- when /iso[-_ ]?8859[-_ ]?15/i then 'ISO-8859-15'
- when /unicode[-_ ]1[-_ ]1[-_ ]utf[-_]7/i then "utf-7"
- when /^euc$/i then 'EUC-JP' # XXX try them all?
- when /^(x-unknown|unknown[-_ ]?8bit|ascii[-_ ]?7[-_ ]?bit)$/i then 'ASCII'
- else orig_charset
- end
-
- begin
- returning(Iconv.iconv(target + "//IGNORE", charset, text + " ").join[0 .. -2]) { |str| str.check }
- rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::InvalidCharacter, Iconv::IllegalSequence, String::CheckError
- debug "couldn't transcode text from #{orig_charset} (#{charset}) to #{target} (#{text[0 ... 20].inspect}...): got #{$!.class} (#{$!.message})"
- text.ascii
- end
- end
-end