lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb in tmail-1.2.7 vs lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb in tmail-1.2.7.1

- old
+ new

@@ -49,118 +49,119 @@ @_mStart = true @_mGotData = false @_mInputState = EPureAscii @_mLastChar = '' if @_mEscCharSetProber - @_mEscCharSetProber.reset() + @_mEscCharSetProber.reset() end for prober in @_mCharSetProbers - prober.reset() + prober.reset() end end def feed(aBuf) return if @done aLen = aBuf.length return if not aLen if not @_mGotData - # If the data starts with BOM, we know it is UTF - if aBuf[0...3] == "\xEF\xBB\xBF" - # EF BB BF UTF-8 with BOM - @result = {'encoding' => "UTF-8", 'confidence' => 1.0} - elsif aBuf[0...4] == "\xFF\xFE\x00\x00" - # FF FE 00 00 UTF-32, little-endian BOM - @result = {'encoding' => "UTF-32LE", 'confidence' => 1.0} - elsif aBuf[0...4] == "\x00\x00\xFE\xFF" - # 00 00 FE FF UTF-32, big-endian BOM - @result = {'encoding' => "UTF-32BE", 'confidence' => 1.0} - elsif aBuf[0...4] == "\xFE\xFF\x00\x00" - # FE FF 00 00 UCS-4, unusual octet order BOM (3412) - @result = {'encoding' => "X-ISO-10646-UCS-4-3412", 'confidence' => 1.0} - elsif aBuf[0...4] == "\x00\x00\xFF\xFE" - # 00 00 FF FE UCS-4, unusual octet order BOM (2143) - @result = {'encoding' => "X-ISO-10646-UCS-4-2143", 'confidence' => 1.0} - elsif aBuf[0...2] == "\xFF\xFE" - # FF FE UTF-16, little endian BOM - @result = {'encoding' => "UTF-16LE", 'confidence' => 1.0} - elsif aBuf[0...2] == "\xFE\xFF" - # FE FF UTF-16, big endian BOM - @result = {'encoding' => "UTF-16BE", 'confidence' => 1.0} - end + # If the data starts with BOM, we know it is UTF + if aBuf[0...3] == "\xEF\xBB\xBF" + # EF BB BF UTF-8 with BOM + @result = {'encoding' => "UTF-8", 'confidence' => 1.0} + elsif aBuf[0...4] == "\xFF\xFE\x00\x00" + # FF FE 00 00 UTF-32, little-endian BOM + @result = {'encoding' => "UTF-32LE", 'confidence' => 1.0} + elsif aBuf[0...4] == "\x00\x00\xFE\xFF" + # 00 00 FE FF UTF-32, big-endian BOM + @result = {'encoding' => "UTF-32BE", 'confidence' => 1.0} + elsif aBuf[0...4] == "\xFE\xFF\x00\x00" + # FE FF 00 00 UCS-4, unusual octet order BOM (3412) + @result = {'encoding' => "X-ISO-10646-UCS-4-3412", 'confidence' => 1.0} + elsif aBuf[0...4] == "\x00\x00\xFF\xFE" + # 00 00 FF FE UCS-4, unusual octet order BOM (2143) + @result = {'encoding' => "X-ISO-10646-UCS-4-2143", 'confidence' => 1.0} + elsif aBuf[0...2] == "\xFF\xFE" + # FF FE UTF-16, little endian BOM + @result = {'encoding' => "UTF-16LE", 'confidence' => 1.0} + elsif aBuf[0...2] == "\xFE\xFF" + # FE FF UTF-16, big endian BOM + @result = {'encoding' => "UTF-16BE", 'confidence' => 1.0} + end end - + @_mGotData = true - if @result['encoding'] and (@result['confidence'] > 0.0): - @done = true - return + if @result['encoding'] and (@result['confidence'] > 0.0) + @done = true + return end - if @_mInputState == EPureAscii: - if @_highBitDetector =~ (aBuf) - @_mInputState = EHighbyte - elsif (@_mInputState == EPureAscii) and @_escDetector =~ (@_mLastChar + aBuf) - @_mInputState = EEscAscii - end + + if @_mInputState == EPureAscii + if @_highBitDetector =~ (aBuf) + @_mInputState = EHighbyte + elsif (@_mInputState == EPureAscii) and @_escDetector =~ (@_mLastChar + aBuf) + @_mInputState = EEscAscii + end end - + @_mLastChar = aBuf[-1..-1] if @_mInputState == EEscAscii - if not @_mEscCharSetProber - @_mEscCharSetProber = EscCharSetProber.new() - end - if @_mEscCharSetProber.feed(aBuf) == EFoundIt - @result = {'encoding' => self._mEscCharSetProber.get_charset_name(), - 'confidence' => @_mEscCharSetProber.get_confidence() - } - @done = true - end + if not @_mEscCharSetProber + @_mEscCharSetProber = EscCharSetProber.new() + end + if @_mEscCharSetProber.feed(aBuf) == EFoundIt + @result = {'encoding' => self._mEscCharSetProber.get_charset_name(), + 'confidence' => @_mEscCharSetProber.get_confidence() + } + @done = true + end elsif @_mInputState == EHighbyte - if not @_mCharSetProbers or @_mCharSetProbers.empty? - @_mCharSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()] - end - for prober in @_mCharSetProbers - if prober.feed(aBuf) == EFoundIt - @result = {'encoding' => prober.get_charset_name(), - 'confidence' => prober.get_confidence()} - @done = true - break - end - end + if not @_mCharSetProbers or @_mCharSetProbers.empty? + @_mCharSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()] + end + for prober in @_mCharSetProbers + if prober.feed(aBuf) == EFoundIt + @result = {'encoding' => prober.get_charset_name(), + 'confidence' => prober.get_confidence()} + @done = true + break + end + end end - + end - + def close return if @done if not @_mGotData - $stderr << "no data received!\n" if $debug - return + $stderr << "no data received!\n" if $debug + return end @done = true - - if @_mInputState == EPureAscii: - @result = {'encoding' => 'ascii', 'confidence' => 1.0} - return @result + + if @_mInputState == EPureAscii + @result = {'encoding' => 'ascii', 'confidence' => 1.0} + return @result end - - if @_mInputState == EHighbyte: - confidences = {} + + if @_mInputState == EHighbyte + confidences = {} @_mCharSetProbers.each{ |prober| confidences[prober] = prober.get_confidence } - maxProber = @_mCharSetProbers.max{ |a,b| confidences[a] <=> confidences[b] } - if maxProber and maxProber.get_confidence > MINIMUM_THRESHOLD - @result = {'encoding' => maxProber.get_charset_name(), - 'confidence' => maxProber.get_confidence()} - return @result - end + maxProber = @_mCharSetProbers.max{ |a,b| confidences[a] <=> confidences[b] } + if maxProber and maxProber.get_confidence > MINIMUM_THRESHOLD + @result = {'encoding' => maxProber.get_charset_name(), + 'confidence' => maxProber.get_confidence()} + return @result + end end if $debug - $stderr << "no probers hit minimum threshhold\n" if $debug - for prober in @_mCharSetProbers[0]._mProbers - next if not prober - $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n" if $debug - end + $stderr << "no probers hit minimum threshhold\n" if $debug + for prober in @_mCharSetProbers[0]._mProbers + next if not prober + $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n" if $debug + end end end end end