Sha256: 5f0d192c5743ef0fc827d628a01005bad75467e0d607c0f9a96314e08fbc67dc

Contents?: true

Size: 741 Bytes

Versions: 17

Compression:

Stored size: 741 Bytes

Contents

module Embulk
  module Guess

    # Guess plugin, registered under the name 'charset', that runs ICU4J
    # charset detection over a sample buffer and reports the detected
    # charset name as the parser's "charset" option.
    class CharsetGuessPlugin < GuessPlugin
      Plugin.register_guess('charset', self)

      # Charset reported whenever detection yields nothing usable.
      DEFAULT_CHARSET = "UTF-8"

      # Detection results with a confidence below this value are ignored.
      MIN_CONFIDENCE = 50

      # Guesses the charset of the sampled input.
      #
      # config        - not read by this plugin.
      # sample_buffer - object responding to +to_java_bytes+
      #                 (presumably a JRuby String; confirm against caller).
      #
      # Returns a Hash of the form {"parser" => {"charset" => name}}.
      def guess(config, sample_buffer)
        # ICU4J
        detector = com.ibm.icu.text.CharsetDetector.new
        detector.setText(sample_buffer.to_java_bytes)
        best_match = detector.detect

        # CharsetDetector#detect is documented to return null (nil under
        # JRuby) when no charset matches at any quality level; treat that
        # the same as a low-confidence match instead of raising.
        name =
          if best_match.nil? || best_match.getConfidence < MIN_CONFIDENCE
            DEFAULT_CHARSET
          else
            best_match.getName
          end

        # An ISO-8859-1 result in most cases means the sample was plain
        # ASCII (a subset of UTF-8) and too small to tell the encodings
        # apart, so report UTF-8 instead.
        name = DEFAULT_CHARSET if name == "ISO-8859-1"

        {"parser" => {"charset" => name}}
      end
    end

  end
end

Version data entries

17 entries across 17 versions & 1 rubygem

Version Path
embulk-0.6.1 lib/embulk/guess/charset.rb
embulk-0.6.0 lib/embulk/guess/charset.rb
embulk-0.5.5 lib/embulk/guess/charset.rb
embulk-0.5.4 lib/embulk/guess/charset.rb
embulk-0.5.3 lib/embulk/guess/charset.rb
embulk-0.5.2 lib/embulk/guess/charset.rb
embulk-0.5.1 lib/embulk/guess/charset.rb
embulk-0.5.0 lib/embulk/guess/charset.rb
embulk-0.4.10 lib/embulk/guess/charset.rb
embulk-0.4.9 lib/embulk/guess/charset.rb
embulk-0.4.8 lib/embulk/guess/charset.rb
embulk-0.4.7 lib/embulk/guess/charset.rb
embulk-0.4.6 lib/embulk/guess/charset.rb
embulk-0.4.5 lib/embulk/guess/charset.rb
embulk-0.4.4 lib/embulk/guess/charset.rb
embulk-0.4.3 lib/embulk/guess/charset.rb
embulk-0.4.2 lib/embulk/guess/charset.rb