lib/sup/util.rb in sup-0.9.1 vs lib/sup/util.rb in sup-0.10

- old
+ new

@@ -175,11 +175,11 @@

 class String
   ## nasty multibyte hack for ruby 1.8. if it's utf-8, split into chars using
   ## the utf8 regex and count those. otherwise, use the byte length.
   def display_length
-    if $encoding == "UTF-8" || $encoding == "utf8"
+    if RUBY_VERSION < '1.9.1' && ($encoding == "UTF-8" || $encoding == "utf8")
       scan(/./u).size
     else
       size
     end
   end
@@ -288,16 +288,49 @@
     def ord
       self[0]
     end
   end

+  unless method_defined? :each
+    def each &b
+      each_line &b
+    end
+  end
+
   ## takes a list of words, and returns an array of symbols. typically used in
   ## Sup for translating Ferret's representation of a list of labels (a string)
   ## to an array of label symbols.
   ##
   ## split_on will be passed to String#split, so you can leave this nil for space.
   def to_set_of_symbols split_on=nil; Set.new split(split_on).map { |x| x.strip.intern } end
+
+  class CheckError < ArgumentError; end
+  def check
+    begin
+      fail "unexpected encoding #{encoding}" if respond_to?(:encoding) && !(encoding == Encoding::UTF_8 || encoding == Encoding::ASCII)
+      fail "invalid encoding" if respond_to?(:valid_encoding?) && !valid_encoding?
+    rescue
+      raise CheckError.new($!.message)
+    end
+  end
+
+  def ascii
+    out = ""
+    each_byte do |b|
+      if (b & 128) != 0
+        out << "\\x#{b.to_s 16}"
+      else
+        out << b.chr
+      end
+    end
+    out.force_encoding Encoding::UTF_8 if out.respond_to? :force_encoding
+    out
+  end
+
+  def transcode src_encoding=$encoding
+    Iconv.easy_decode $encoding, src_encoding, self
+  end
 end

 class Numeric
   def clamp min, max
     if self < min
@@ -484,13 +517,13 @@
       strftime "%b %Y"
     elsif month != from.month
       strftime "%b %e"
     else
       if is_the_same_day? from
-        strftime("%l:%M%P")
+        strftime("%l:%M%p").downcase # emulate %P (missing on ruby 1.8 darwin)
       elsif is_the_day_before? from
-        "Yest." + nearest_hour.strftime("%l%P")
+        "Yest." + nearest_hour.strftime("%l%p").downcase # emulate %P
       else
         strftime "%b %e"
       end
     end
   end
@@ -639,23 +672,28 @@
     @m.synchronize { !@over && @over = true }
   end
 end

 class Iconv
-  def self.easy_decode target, charset, text
-    return text if charset =~ /^(x-unknown|unknown[-_ ]?8bit|ascii[-_ ]?7[-_ ]?bit)$/i
-    charset = case charset
+  def self.easy_decode target, orig_charset, text
+    if text.respond_to? :force_encoding
+      text = text.dup
+      text.force_encoding Encoding::BINARY
+    end
+    charset = case orig_charset
       when /UTF[-_ ]?8/i then "utf-8"
       when /(iso[-_ ])?latin[-_ ]?1$/i then "ISO-8859-1"
       when /iso[-_ ]?8859[-_ ]?15/i then 'ISO-8859-15'
       when /unicode[-_ ]1[-_ ]1[-_ ]utf[-_]7/i then "utf-7"
-      else charset
+      when /^euc$/i then 'EUC-JP' # XXX try them all?
+      when /^(x-unknown|unknown[-_ ]?8bit|ascii[-_ ]?7[-_ ]?bit)$/i then 'ASCII'
+      else orig_charset
     end

     begin
-      Iconv.iconv(target + "//IGNORE", charset, text + " ").join[0 .. -2]
-    rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::InvalidCharacter, Iconv::IllegalSequence => e
-      warn "couldn't transcode text from #{charset} to #{target} (\"#{text[0 ... 20]}\"...) (got #{e.message}); using original as is"
-      text
+      returning(Iconv.iconv(target, charset, text + " ").join[0 .. -2]) { |str| str.check }
+    rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::InvalidCharacter, Iconv::IllegalSequence, String::CheckError
+      debug "couldn't transcode text from #{orig_charset} (#{charset}) to #{target}) (#{text[0 ... 20].inspect}...) (got #{$!.message} (#{$!.class}))"
+      text.ascii
     end
   end
 end