lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb in tmail-1.2.7 vs lib/tmail/vendor/rchardet-1.3/lib/rchardet/universaldetector.rb in tmail-1.2.7.1
- old
+ new
@@ -49,118 +49,119 @@
@_mStart = true
@_mGotData = false
@_mInputState = EPureAscii
@_mLastChar = ''
if @_mEscCharSetProber
- @_mEscCharSetProber.reset()
+ @_mEscCharSetProber.reset()
end
for prober in @_mCharSetProbers
- prober.reset()
+ prober.reset()
end
end
# Feeds one chunk of raw data to the detector.
#
# The first non-empty chunk is checked for a byte order mark; a BOM settles
# the encoding immediately with confidence 1.0. Otherwise the chunk moves the
# input state away from pure ASCII when it contains high-bit bytes or an
# escape sequence, and is handed to the probers that match the current state.
# Once any prober reports EFoundIt, @result is filled in and @done is set.
#
# aBuf - String holding the next chunk of data. nil or empty chunks are
#        ignored and do not count as received data.
#
# Callers should read the outcome from @result after calling close; the
# return value of this method carries no meaning.
def feed(aBuf)
  return if @done
  # 0 is truthy in Ruby, so emptiness has to be tested explicitly. Letting an
  # empty chunk through would set @_mLastChar to nil and break the next call.
  return if aBuf.nil? || aBuf.empty?

  unless @_mGotData
    # If the data starts with a BOM, we know it is UTF.
    # Compare leading bytes numerically: aBuf[0...n] counts characters rather
    # than bytes on encoding-aware Rubies, and String#== is false for
    # non-ASCII strings whose encodings differ. unpack pads with nil when the
    # chunk is shorter than four bytes, so short chunks simply fail to match.
    head = aBuf.unpack('C4')
    # Order matters: four-byte marks must be tried before the two-byte marks
    # that share their prefix.
    signatures = [
      [[0xEF, 0xBB, 0xBF],       'UTF-8'],                  # UTF-8 with BOM
      [[0xFF, 0xFE, 0x00, 0x00], 'UTF-32LE'],               # UTF-32, little-endian BOM
      [[0x00, 0x00, 0xFE, 0xFF], 'UTF-32BE'],               # UTF-32, big-endian BOM
      [[0xFE, 0xFF, 0x00, 0x00], 'X-ISO-10646-UCS-4-3412'], # UCS-4, unusual octet order (3412)
      [[0x00, 0x00, 0xFF, 0xFE], 'X-ISO-10646-UCS-4-2143'], # UCS-4, unusual octet order (2143)
      [[0xFF, 0xFE],             'UTF-16LE'],               # UTF-16, little-endian BOM
      [[0xFE, 0xFF],             'UTF-16BE']                # UTF-16, big-endian BOM
    ]
    hit = signatures.find { |bytes, _name| head[0, bytes.length] == bytes }
    @result = { 'encoding' => hit[1], 'confidence' => 1.0 } if hit
  end

  @_mGotData = true
  if @result['encoding'] && @result['confidence'] > 0.0
    @done = true
    return
  end

  if @_mInputState == EPureAscii
    if @_highBitDetector =~ aBuf
      @_mInputState = EHighbyte
    elsif @_escDetector =~ (@_mLastChar + aBuf)
      # The previous chunk's last character is prepended so an escape
      # sequence split across two chunks is still seen.
      @_mInputState = EEscAscii
    end
  end

  @_mLastChar = aBuf[-1..-1]

  if @_mInputState == EEscAscii
    # The escape-sequence prober is created on first use.
    @_mEscCharSetProber ||= EscCharSetProber.new
    if @_mEscCharSetProber.feed(aBuf) == EFoundIt
      @result = { 'encoding' => @_mEscCharSetProber.get_charset_name,
                  'confidence' => @_mEscCharSetProber.get_confidence }
      @done = true
    end
  elsif @_mInputState == EHighbyte
    if !@_mCharSetProbers || @_mCharSetProbers.empty?
      @_mCharSetProbers = [MBCSGroupProber.new, SBCSGroupProber.new, Latin1Prober.new]
    end
    @_mCharSetProbers.each do |prober|
      next unless prober.feed(aBuf) == EFoundIt

      @result = { 'encoding' => prober.get_charset_name,
                  'confidence' => prober.get_confidence }
      @done = true
      break
    end
  end
end
-
+
# Finishes detection once all data has been fed.
#
# Does nothing when detection already finished or when no data was received.
# Otherwise marks the detector as done and settles @result: pure-ASCII input
# is reported as 'ascii' with confidence 1.0; high-byte input takes the
# prober with the highest confidence, provided it exceeds MINIMUM_THRESHOLD.
#
# Returns the @result hash when this call settled it, nil otherwise
# (already done, no data, or no prober confident enough).
def close
  return if @done

  unless @_mGotData
    $stderr << "no data received!\n" if $debug
    return
  end
  @done = true

  if @_mInputState == EPureAscii
    @result = { 'encoding' => 'ascii', 'confidence' => 1.0 }
    return @result
  end

  if @_mInputState == EHighbyte
    # Read each confidence once so the comparison uses stable values.
    confidences = {}
    @_mCharSetProbers.each { |prober| confidences[prober] = prober.get_confidence }
    maxProber = @_mCharSetProbers.max { |a, b| confidences[a] <=> confidences[b] }
    if maxProber && maxProber.get_confidence > MINIMUM_THRESHOLD
      @result = { 'encoding' => maxProber.get_charset_name,
                  'confidence' => maxProber.get_confidence }
      return @result
    end
  end

  if $debug
    $stderr << "no probers hit minimum threshhold\n"
    # Only the first prober's children are dumped. The list can be empty
    # here (e.g. input never reached the high-byte state), so guard the
    # lookup instead of calling _mProbers on nil.
    first_group = @_mCharSetProbers && @_mCharSetProbers[0]
    if first_group
      first_group._mProbers.each do |prober|
        next unless prober

        $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n"
      end
    end
  end
  # Nothing was settled; keep the return value the same with or without $debug.
  nil
end
end
end